Skip to content

Commit c0461ef

Browse files
committed
refactor [llm]
1 parent 5ee3e98 commit c0461ef

File tree

1 file changed

+131
-96
lines changed

1 file changed

+131
-96
lines changed

hf_model_evaluation/scripts/evaluation_manager.py

Lines changed: 131 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"""
2121

2222
import argparse
23+
import json
2324
import os
2425
import re
2526
from typing import Any, Dict, List, Optional, Tuple
@@ -130,6 +131,20 @@ def normalize_model_name(name: str) -> tuple[set[str], str]:
130131
return tokens, normalized
131132

132133

134+
def parse_numeric_cell(cell: Optional[str]) -> Optional[float]:
    """Convert a markdown table cell into a float, or ``None`` if not numeric.

    Percent signs and thousands separators are stripped first, so values
    such as ``"85.3%"`` or ``"1,234"`` parse successfully. Empty, blank,
    ``None``, and non-numeric cells all yield ``None``.
    """
    if not cell:
        return None

    try:
        # Non-string inputs raise AttributeError on .replace(); treat as unparseable.
        cleaned = cell.replace("%", "").replace(",", "").strip()
    except AttributeError:
        return None

    if not cleaned:
        return None

    try:
        return float(cleaned)
    except ValueError:
        return None
146+
147+
133148
def find_main_model_column(header: List[str], model_name: str) -> Optional[int]:
134149
"""
135150
Identify the column index that corresponds to the main model.
@@ -333,42 +348,31 @@ def extract_metrics_from_table(
333348

334349
# If we identified a specific column, use it; otherwise use first numeric value
335350
if target_column is not None and target_column < len(row):
336-
try:
337-
value_str = row[target_column].replace("%", "").replace(",", "").strip()
338-
if value_str:
339-
value = float(value_str)
340-
metrics.append({
341-
"name": benchmark_name,
342-
"type": benchmark_name.lower().replace(" ", "_"),
343-
"value": value
344-
})
345-
except (ValueError, IndexError):
346-
pass
351+
value = parse_numeric_cell(row[target_column])
352+
if value is not None:
353+
metrics.append({
354+
"name": benchmark_name,
355+
"type": benchmark_name.lower().replace(" ", "_"),
356+
"value": value
357+
})
347358
else:
348359
# Extract numeric values from remaining columns (original behavior)
349360
for i, cell in enumerate(row[1:], start=1):
350-
try:
351-
# Remove common suffixes and convert to float
352-
value_str = cell.replace("%", "").replace(",", "").strip()
353-
if not value_str:
354-
continue
355-
356-
value = float(value_str)
357-
358-
# Determine metric name
359-
metric_name = benchmark_name
360-
if len(header) > i and header[i].lower() not in ["score", "value", "result"]:
361-
metric_name = f"{benchmark_name} ({header[i]})"
362-
363-
metrics.append({
364-
"name": metric_name,
365-
"type": benchmark_name.lower().replace(" ", "_"),
366-
"value": value
367-
})
368-
break # Only take first numeric value per row
369-
except (ValueError, IndexError):
361+
value = parse_numeric_cell(cell)
362+
if value is None:
370363
continue
371364

365+
metric_name = benchmark_name
366+
if len(header) > i and header[i].lower() not in ["score", "value", "result"]:
367+
metric_name = f"{benchmark_name} ({header[i]})"
368+
369+
metrics.append({
370+
"name": metric_name,
371+
"type": benchmark_name.lower().replace(" ", "_"),
372+
"value": value
373+
})
374+
break # Only take first numeric value per row
375+
372376
elif table_format == "transposed":
373377
# Models are in rows (first column), benchmarks are in columns (header)
374378
# Find the row that matches the target model
@@ -397,20 +401,13 @@ def extract_metrics_from_table(
397401
if not benchmark_name or i >= len(target_row):
398402
continue
399403

400-
try:
401-
value_str = target_row[i].replace("%", "").replace(",", "").strip()
402-
if not value_str:
403-
continue
404-
405-
value = float(value_str)
406-
404+
value = parse_numeric_cell(target_row[i])
405+
if value is not None:
407406
metrics.append({
408407
"name": benchmark_name,
409408
"type": benchmark_name.lower().replace(" ", "_").replace("-", "_"),
410409
"value": value
411410
})
412-
except (ValueError, AttributeError):
413-
continue
414411

415412
else: # table_format == "columns"
416413
# Benchmarks are in columns
@@ -424,20 +421,13 @@ def extract_metrics_from_table(
424421
if not benchmark_name or i >= len(data_row):
425422
continue
426423

427-
try:
428-
value_str = data_row[i].replace("%", "").replace(",", "").strip()
429-
if not value_str:
430-
continue
431-
432-
value = float(value_str)
433-
424+
value = parse_numeric_cell(data_row[i])
425+
if value is not None:
434426
metrics.append({
435427
"name": benchmark_name,
436428
"type": benchmark_name.lower().replace(" ", "_"),
437429
"value": value
438430
})
439-
except ValueError:
440-
continue
441431

442432
return metrics
443433

@@ -598,6 +588,35 @@ def extract_tables_with_parser(markdown_content: str) -> List[Dict[str, Any]]:
598588
return tables
599589

600590

591+
def format_model_index(repo_id: str, results: List[Dict[str, Any]], output_format: str = "yaml") -> str:
    """Serialize a model-index payload for a repo as YAML (default) or JSON.

    Args:
        repo_id: Hugging Face repository id; only the part after the last
            ``/`` is used as the model name.
        results: Evaluation result dicts to embed under ``results``.
        output_format: ``"json"`` for a 2-space-indented JSON string; any
            other value yields YAML with key order preserved.

    Returns:
        The serialized ``model-index`` document as a string.
    """
    model_name = repo_id.split("/")[-1] if "/" in repo_id else repo_id
    payload = {"model-index": [{"name": model_name, "results": results}]}

    if output_format == "json":
        return json.dumps(payload, indent=2)
    return yaml.dump(payload, sort_keys=False)
605+
606+
607+
def build_extract_command(repo_id: str, table_number: int, model_name_override: Optional[str]) -> str:
    """Assemble the suggested ``extract-readme`` CLI invocation.

    Args:
        repo_id: Repository id passed via ``--repo-id``.
        table_number: 1-based table index passed via ``--table``.
        model_name_override: Optional column header forced through
            ``--model-name-override``; omitted when falsy.

    Returns:
        A multi-line shell command joined with backslash continuations,
        always ending in ``--dry-run``.
    """
    flags = [
        f'--repo-id "{repo_id}"',
        f"--table {table_number}",
    ]
    if model_name_override:
        flags.append(f'--model-name-override "{model_name_override}"')
    flags.append("--dry-run")

    command_lines = ["python scripts/evaluation_manager.py extract-readme", *flags]
    return " \\\n    ".join(command_lines)
618+
619+
601620
def detect_table_format(table: Dict[str, Any], repo_id: str) -> Dict[str, Any]:
602621
"""Analyze a table to detect its format and identify model columns."""
603622
headers = table.get("headers", [])
@@ -662,7 +681,7 @@ def detect_table_format(table: Dict[str, Any], repo_id: str) -> Dict[str, Any]:
662681
}
663682

664683

665-
def inspect_tables(repo_id: str) -> None:
684+
def inspect_tables(repo_id: str, output_format: str = "text") -> None:
666685
"""Inspect and display all evaluation tables in a model's README."""
667686
try:
668687
card = ModelCard.load(repo_id, token=HF_TOKEN)
@@ -678,76 +697,80 @@ def inspect_tables(repo_id: str) -> None:
678697
print(f"No tables found in README for {repo_id}")
679698
return
680699

681-
print(f"\n{'='*70}")
682-
print(f"Tables found in README for: {repo_id}")
683-
print(f"{'='*70}")
700+
summary: Dict[str, Any] = {"repo_id": repo_id, "tables": []}
701+
702+
if output_format == "text":
703+
print(f"\n{'='*70}")
704+
print(f"Tables found in README for: {repo_id}")
705+
print(f"{'='*70}")
684706

685707
eval_table_count = 0
686-
for table in tables:
708+
for idx, table in enumerate(tables, start=1):
687709
analysis = detect_table_format(table, repo_id)
688710

689711
if analysis["format"] == "unknown" and not analysis.get("sample_rows"):
690712
continue
691713

692714
eval_table_count += 1
693-
print(f"\n## Table {eval_table_count}")
715+
716+
override_value = None
717+
if analysis["format"] == "comparison":
718+
exact = next((c for c in analysis.get("model_columns", []) if c["is_exact_match"]), None)
719+
if exact:
720+
override_value = exact["header"]
721+
else:
722+
partial = next((c for c in analysis.get("model_columns", []) if c["is_partial_match"]), None)
723+
override_value = partial["header"] if partial else None
724+
725+
suggested_command = build_extract_command(repo_id, idx, override_value)
726+
727+
table_summary = {
728+
"table_number": idx,
729+
"format": analysis["format"],
730+
"row_count": analysis["row_count"],
731+
"columns": analysis["columns"],
732+
"model_columns": analysis.get("model_columns", []),
733+
"sample_rows": analysis.get("sample_rows", []),
734+
"suggested_command": suggested_command,
735+
}
736+
summary["tables"].append(table_summary)
737+
738+
if output_format == "json":
739+
continue
740+
741+
print(f"\n## Table {idx}")
694742
print(f" Format: {analysis['format']}")
695743
print(f" Rows: {analysis['row_count']}")
696744

697745
print(f"\n Columns ({len(analysis['columns'])}):")
698746
for col_info in analysis.get("model_columns", []):
699-
idx = col_info["index"]
747+
col_idx = col_info["index"]
700748
header = col_info["header"]
701749
if col_info["is_exact_match"]:
702-
print(f" [{idx}] {header} ✓ EXACT MATCH")
750+
print(f" [{col_idx}] {header} ✓ EXACT MATCH")
703751
elif col_info["is_partial_match"]:
704-
print(f" [{idx}] {header} ~ partial match")
752+
print(f" [{col_idx}] {header} ~ partial match")
705753
else:
706-
print(f" [{idx}] {header}")
754+
print(f" [{col_idx}] {header}")
707755

708756
if analysis.get("sample_rows"):
709757
print(f"\n Sample rows (first column):")
710758
for row_val in analysis["sample_rows"][:5]:
711759
print(f" - {row_val}")
712760

713-
# Build suggested command
714-
cmd_parts = [
715-
"python scripts/evaluation_manager.py extract-readme",
716-
f'--repo-id "{repo_id}"',
717-
f"--table {eval_table_count}"
718-
]
761+
if override_value and not any(c["is_exact_match"] for c in analysis.get("model_columns", [])):
762+
print(f"\n ⚠ No exact match. Best candidate: {override_value}")
719763

720-
override_value = None
721-
if analysis["format"] == "comparison":
722-
exact = next((c for c in analysis.get("model_columns", []) if c["is_exact_match"]), None)
723-
if exact:
724-
print(f"\n ✓ Column match: {exact['header']}")
725-
else:
726-
partial = next((c for c in analysis.get("model_columns", []) if c["is_partial_match"]), None)
727-
if partial:
728-
override_value = partial["header"]
729-
print(f"\n ⚠ No exact match. Best candidate: {partial['header']}")
730-
elif analysis.get("model_columns"):
731-
print(f"\n ⚠ Could not identify model column. Options:")
732-
for col_info in analysis.get("model_columns", []):
733-
print(f' "{col_info["header"]}"')
734-
override_value = analysis["model_columns"][0]["header"]
735-
736-
if override_value:
737-
cmd_parts.append(f'--model-name-override "{override_value}"')
738-
739-
cmd_parts.append("--dry-run")
740-
741-
print(f"\n Suggested command:")
742-
print(f" {cmd_parts[0]} \\")
743-
for part in cmd_parts[1:-1]:
744-
print(f" {part} \\")
745-
print(f" {cmd_parts[-1]}")
746-
747-
if eval_table_count == 0:
764+
print(f"\n Suggested command:\n {suggested_command}")
765+
766+
if eval_table_count == 0 and output_format == "text":
748767
print("\nNo evaluation tables detected.")
749768

750-
print(f"\n{'='*70}\n")
769+
if output_format == "json":
770+
print(json.dumps(summary, indent=2))
771+
772+
if output_format == "text":
773+
print(f"\n{'='*70}\n")
751774

752775
except Exception as e:
753776
print(f"Error inspecting tables: {e}")
@@ -1065,6 +1088,12 @@ def main():
10651088
extract_parser.add_argument("--dataset-type", type=str, default="benchmark", help="Dataset type")
10661089
extract_parser.add_argument("--create-pr", action="store_true", help="Create PR instead of direct push")
10671090
extract_parser.add_argument("--dry-run", action="store_true", help="Preview YAML without updating")
1091+
extract_parser.add_argument(
1092+
"--output-format",
1093+
choices=["yaml", "json"],
1094+
default="yaml",
1095+
help="Output format for --dry-run"
1096+
)
10681097

10691098
# Import from AA command
10701099
aa_parser = subparsers.add_parser(
@@ -1104,6 +1133,12 @@ def main():
11041133
"""
11051134
)
11061135
inspect_parser.add_argument("--repo-id", type=str, required=True, help="HF repository ID")
1136+
inspect_parser.add_argument(
1137+
"--output-format",
1138+
choices=["text", "json"],
1139+
default="text",
1140+
help="Choose machine-readable JSON for LLM workflows"
1141+
)
11071142

11081143
args = parser.parse_args()
11091144

@@ -1128,7 +1163,7 @@ def main():
11281163

11291164
if args.dry_run:
11301165
print("\nPreview of extracted evaluations:")
1131-
print(yaml.dump({"model-index": [{"name": args.repo_id.split("/")[-1], "results": results}]}, sort_keys=False))
1166+
print(format_model_index(args.repo_id, results, args.output_format))
11321167
else:
11331168
update_model_card_with_evaluations(
11341169
repo_id=args.repo_id,
@@ -1162,7 +1197,7 @@ def main():
11621197
validate_model_index(args.repo_id)
11631198

11641199
elif args.command == "inspect-tables":
1165-
inspect_tables(args.repo_id)
1200+
inspect_tables(args.repo_id, output_format=args.output_format)
11661201

11671202

11681203
if __name__ == "__main__":

0 commit comments

Comments (0)