
Commit 94d8208

agronskiy and marta-sd authored
feat(sdk): propagate OCI layer-based metadata to the launcher (#523)
(Still under active self-review; some AI-generated remnants remain.)

Implemented OCI layer inspection to extract framework definitions from container images without pulling entire images. The system now:

- Extracts `framework.yml` files from containers using OCI layer inspection (`partial_pull.py`)
- Parses framework definitions using `nemo_evaluator.core.input.get_framework_evaluations()`
- Converts them to structured Intermediate Representations (`TaskIntermediateRepresentation`, `HarnessIntermediateRepresentation`)
- Serializes the IRs to `all_tasks_irs.yaml` with `mapping.toml` checksum validation
- Provides CLI commands (`ls task`, `ls tasks`) with a `--from` flag for on-the-fly container inspection
- Generates documentation automatically, with checksum validation to ensure consistency
- Validates in CI that `mapping.toml` changes are reflected in `all_tasks_irs.yaml`

## Approximate set of changes

### 1. OCI Layer Inspection

- `partial_pull.py` module for OCI layer inspection
- Extracts files from Docker image layers without pulling entire images
- Supports GitLab and nvcr.io registries; supports the Docker credentials file (`~/.docker/config.json`) as a fallback
- Implements a caching mechanism (`~/.nemo-evaluator/docker-meta/`)

### 2. Framework Extraction Script

- `load_framework_definitions.py` script
- Extracts `framework.yml` from containers using OCI layer inspection
- Uses `find_file_matching_pattern_in_image_layers()` for pattern-based search
- Parses `framework.yml` using `nemo_evaluator.core.input.get_framework_evaluations()`
- Converts the result to Intermediate Representations (IRs)
- Serializes them to `all_tasks_irs.yaml` with checksum metadata
- **Checksum validation**: calculates the SHA256 checksum of `mapping.toml` and stores it in the `all_tasks_irs.yaml` metadata
  - When `mapping.toml` changes, `all_tasks_irs.yaml` must be regenerated locally by running this script
  - CI runs the `test_packaged_mapping_toml_checksum_match()` test, which fails if the checksums don't match
  - This ensures changes to `mapping.toml` are always reflected in `all_tasks_irs.yaml` before merging

### 3. IR-Based Loading System

- **Added** `all_tasks_irs.yaml` (a single YAML document with `metadata` and `tasks` sections)
- **Uses** `nemo_evaluator.core.input.get_framework_evaluations()` to parse `framework.yml` files
- **Converts** parsed data to `TaskIntermediateRepresentation` and `HarnessIntermediateRepresentation` dataclasses
- **Checksum validation**: the `mapping.toml` checksum stored in the `all_tasks_irs.yaml` metadata is validated on load (a minimal sketch of this gate follows after Section 4)
  - Validates that `all_tasks_irs.yaml` is in sync with `mapping.toml`
  - The CI test `test_packaged_mapping_toml_checksum_match()` ensures packaged artifacts match

### 4. CLI Commands

- **Added** `ls_task.py` command (new file)
  - Loads from `all_tasks_irs.yaml` via `load_tasks_from_tasks_file()`
  - Displays a warning when `mapping_verified=False`
  - Supports a `--from <container>` flag for on-the-fly container inspection
- **Updated** `ls_tasks.py` command
  - Added a `--from <container>` flag for on-the-fly container inspection
  - Continues using `mapping.toml` when `--from` is not provided
- **Added** `--from <container>` flag to both commands
  - Extracts `framework.yml` from the container using OCI layer inspection
  - Parses to IRs on the fly
  - Bypasses packaged resources completely
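To make the checksum gate described in Sections 2–4 concrete, here is a minimal sketch of the idea. It assumes only what the bullets above state (a single `metadata`/`tasks` YAML document and a `sha256:<hex>` checksum of `mapping.toml`); the function names and the metadata field name below are illustrative placeholders, not the actual launcher API:

```python
import hashlib
from pathlib import Path

import yaml  # PyYAML, assumed available alongside the launcher


def sha256_of(path: Path) -> str:
    """Checksum in the `sha256:<hex>` form stored in the autogenerated metadata."""
    return "sha256:" + hashlib.sha256(path.read_bytes()).hexdigest()


def load_tasks_with_gate(irs_path: Path, mapping_path: Path) -> tuple[list, bool]:
    """Illustrative stand-in for the loader behind `load_tasks_from_tasks_file()`.

    The real implementation builds TaskIntermediateRepresentation /
    HarnessIntermediateRepresentation objects; only the gate is shown here.
    """
    doc = yaml.safe_load(irs_path.read_text())
    recorded = doc["metadata"]["mapping_toml_checksum"]  # field name is an assumption
    mapping_verified = recorded == sha256_of(mapping_path)
    if not mapping_verified:
        # `ls task` / `ls tasks` warn here; the doc autogen script and the CI test
        # test_packaged_mapping_toml_checksum_match() fail outright.
        print("WARNING: mapping.toml changed; regenerate all_tasks_irs.yaml")
    return doc["tasks"], mapping_verified
```

With `--from <container>`, this gate does not apply at all: `framework.yml` is extracted from the image layers and converted to IRs on the fly, so the packaged `all_tasks_irs.yaml` and its checksum are bypassed.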
### 5. Documentation Generation

- **Added** `autogen_task_yamls.py` script
  - Uses `load_tasks_from_tasks_file()` to load the IRs
  - Generates harness markdown pages (`docs/task_catalog/harnesses/*.md`)
  - Generates the benchmarks table (`docs/task_catalog/benchmarks-table.md`)
- **Added** checksum validation: the script fails if `mapping_verified=False`
- **Integrated** into the Sphinx build process (the `setup()` function in `docs/conf.py`); a sketch of this hook pattern appears further below
- Adds horizontal separators (`---`) between tasks in harness pages

<img width="1039" height="935" alt="image" src="https://github.com/user-attachments/assets/cb254618-729b-423a-be95-dad2eac1d42d" />
<img width="1098" height="874" alt="image" src="https://github.com/user-attachments/assets/3a62fcee-677a-4c33-83ed-2a6efab383e2" />
<img width="1142" height="1270" alt="image" src="https://github.com/user-attachments/assets/40609fec-72d1-4ab1-a64d-209da328d72a" />
<img width="1034" height="292" alt="image" src="https://github.com/user-attachments/assets/a5ba5df4-2af2-485e-9e7e-d2b4e3211912" />

## Summary by CodeRabbit

* **New Features**
  * Added auto-generated task catalog documentation with detailed task information and a benchmarks table.
  * Introduced a new CLI command to view individual task details, with filtering and JSON output support.
  * Included task descriptions and types in task listings for better discoverability.
* **Documentation**
  * Auto-generated README table with harness information, container details, and NGC links.
  * Enhanced benchmark documentation with autogenerated content blocks.
* **Tests**
  * Added validation tests for task intermediate representations and mapping checksums.
  * Updated existing tests to reflect the new task metadata structure.

---------

Signed-off-by: Alex Gronskiy <[email protected]>
Signed-off-by: Marta Stepniewska-Dziubinska <[email protected]>
Co-authored-by: Marta Stepniewska-Dziubinska <[email protected]>
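As a rough illustration of the Sphinx integration mentioned in Section 5, the hook pattern presumably looks like the sketch below. This is an assumed fragment, not the repository's actual `docs/conf.py`; in particular, the import path and entry point of `autogen_task_yamls.py` are placeholders:

```python
# Illustrative fragment of a docs/conf.py setup() hook (assumed, not the real file).
import sys
from pathlib import Path


def _generate_task_docs(app):
    """Regenerate the task catalog pages before Sphinx reads the sources."""
    # Placeholder import path: the real autogen_task_yamls.py may live elsewhere
    # and expose a different entry point.
    sys.path.insert(0, str(Path(__file__).parent / "scripts"))
    from autogen_task_yamls import main as autogen_main
    autogen_main()


def setup(app):
    # "builder-inited" fires after the builder is created and before sources are
    # read, so the generated pages exist when the build starts.
    app.connect("builder-inited", _generate_task_docs)
    return {"parallel_read_safe": True}
```

Because the generator fails when `mapping_verified=False`, a stale `all_tasks_irs.yaml` should break the docs build rather than silently publishing an out-of-date catalog.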


46 files changed: +26567 / -2233 lines

.github/config/requirements.txt

Lines changed: 3 additions & 13 deletions
@@ -9,18 +9,8 @@ nvidia-sphinx-theme # Our NVIDIA theme
 sphinxcontrib-mermaid # For mermaid diagrams
 myst-parser # For our markdown docs
 sphinx-design # For our design elements
-sphinxcontrib-mermaid # For mermaid diagrams
 swagger-plugin-for-sphinx # For Swagger API documentation
 
-
-flask
-jinja2
-psutil
-pydantic
-pydantic-core>=2.17
-requests
-structlog
-typing-extensions>=4.0.0
-pyyaml
-werkzeug
-yq
+# Editable local dependencies for autogen scripts and API docs
+-e packages/nemo-evaluator
+-e packages/nemo-evaluator-launcher

.github/workflows/cicd-main.yml

Lines changed: 32 additions & 4 deletions
@@ -68,9 +68,36 @@ jobs:
           pre-commit install
           pre-commit run --all-files --show-diff-on-failure --color=always
 
-  cicd-wait-in-queue:
+  validate-container-digests:
     runs-on: ubuntu-latest
     needs: [pre-flight, linting]
+    if: |
+      (
+        needs.pre-flight.outputs.is_deployment_workflow == 'false'
+        && needs.pre-flight.outputs.is_ci_workload == 'true'
+      ) || (
+        needs.pre-flight.outputs.is_deployment_workflow == 'false'
+        && needs.pre-flight.outputs.is_ci_workload == 'false'
+        && needs.pre-flight.outputs.docs_only == 'false'
+      )
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install package dependencies
+        run: |
+          cd packages/nemo-evaluator-launcher
+          pip install -e .
+      - name: Validate container digests
+        run: |
+          make container-metadata-verify
+
+  cicd-wait-in-queue:
+    runs-on: ubuntu-latest
+    needs: [pre-flight, linting, validate-container-digests]
     environment: test
     if: |
       !(needs.pre-flight.outputs.is_ci_workload == 'true'
@@ -82,7 +109,7 @@ jobs:
           echo "Running CI tests"
 
   cicd-unit-tests-nemo-evaluator:
-    needs: [pre-flight, cicd-wait-in-queue]
+    needs: [pre-flight, cicd-wait-in-queue, validate-container-digests]
     runs-on: ubuntu-latest
     name: unit-tests
     environment: nemo-ci
@@ -106,7 +133,7 @@ jobs:
           package: nemo-evaluator
 
   cicd-unit-tests-nemo-evaluator-launcher:
-    needs: [pre-flight, cicd-wait-in-queue]
+    needs: [pre-flight, cicd-wait-in-queue, validate-container-digests]
     runs-on: ubuntu-latest
     name: unit-tests-launcher
     environment: nemo-ci
@@ -130,7 +157,8 @@ jobs:
           package: nemo-evaluator-launcher
 
   cicd-e2e-tests-nemo-evaluator:
-    needs: [pre-flight, cicd-unit-tests-nemo-evaluator]
+    needs:
+      [pre-flight, cicd-unit-tests-nemo-evaluator, validate-container-digests]
     runs-on: ubuntu-latest
     name: functional-tests
     environment: nemo-ci

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -183,4 +183,7 @@ nemo_experiments/
 
 slurm*.out
 
-docs/**/generated/**/*
+docs/**/generated/**/*
+
+# Autogenerated task documentation (regenerated during build)
+docs/evaluation/benchmarks/catalog/all/**

README.md

Lines changed: 27 additions & 1 deletion
@@ -11,7 +11,7 @@
 [![nemo-evaluator-launcher PyPI downloads](https://img.shields.io/pypi/dm/nemo-evaluator-launcher.svg)](https://pypi.org/project/nemo-evaluator-launcher/)
 [![Project Status](https://img.shields.io/badge/Status-Production%20Ready-green)](#)
 
-## [📖 Documentation](https://docs.nvidia.com/nemo/evaluator/latest/)
+## [📖 Documentation](https://docs.nvidia.com/nemo/evaluator/latest/)
 
 NeMo Evaluator SDK is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models. It enables you to run hundreds of benchmarks across popular evaluation harnesses against any OpenAI-compatible model API. Evaluations execute in open-source Docker containers for auditable and trustworthy results. The platform's containerized architecture allows for the rapid integration of public benchmarks and private datasets.
 
@@ -58,6 +58,32 @@ NeMo Evaluator Launcher provides pre-built evaluation containers for different e
 | **tooltalk** | Tool usage evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/tooltalk) | `25.11` | ToolTalk |
 | **vlmevalkit** | Vision-language model evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit) | `25.11` | AI2D, ChartQA, MMMU, MathVista-MINI, OCRBench, SlideVQA |
 
<!-- BEGIN AUTOGENERATION -->
<!-- mapping toml checksum: sha256:b7fdaa7f01a641970f864c6aab95d7f9e49b883dee8558e8636eb8018a01388e -->
<!--
| Container | Description | NGC Catalog | Latest Tag | Supported benchmarks |
|-----------|-------------|-------------|------------|------------|
| **bfcl** | The Berkeley Function Calling Leaderboard V3 (also called Berkeley Tool Calling Leaderboard V3) evaluates the LLM's ability to call functions (aka tools) accurately. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/bfcl?version=25.11) | `25.11` | bfclv3, bfclv3_ast, bfclv3_ast_prompting, bfclv2, bfclv2_ast, bfclv2_ast_prompting |
| **bigcode-evaluation-harness** | A framework for the evaluation of autoregressive code generation language models. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/bigcode-evaluation-harness?version=25.11) | `25.11` | humaneval, humaneval_instruct, humanevalplus, mbpp, mbppplus, mbppplus_nemo, multiple-py, multiple-sh, multiple-cpp, multiple-cs, multiple-d, multiple-go, multiple-java, multiple-js, multiple-jl, multiple-lua, multiple-pl, multiple-php, multiple-r, multiple-rkt, multiple-rb, multiple-rs, multiple-scala, multiple-swift, multiple-ts |
| **garak** | Garak is an LLM vulnerability scanner. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/garak?version=25.11) | `25.11` | garak |
| **genai_perf_eval** | GenAI Perf is a tool to evaluate the performance of LLM endpoints. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/genai-perf?version=25.11) | `25.11` | genai_perf_summarization, genai_perf_generation |
| **helm** | A framework for evaluating large language models in medical applications across various healthcare tasks | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/helm?version=25.11) | `25.11` | medcalc_bench, medec, head_qa, medbullets, pubmed_qa, ehr_sql, race_based_med, medhallu, mtsamples_replicate, aci_bench, mtsamples_procedures, medication_qa, med_dialog_healthcaremagic, med_dialog_icliniq, medi_qa |
| **hle** | Humanity's Last Exam (HLE) is a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. Humanity's Last Exam consists of 3,000 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/hle?version=25.11) | `25.11` | hle, hle_aa_v2 |
| **ifbench** | IFBench is a new, challenging benchmark for precise instruction following. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/ifbench?version=25.11) | `25.11` | ifbench, ifbench_aa_v2 |
| **livecodebench** | Holistic and Contamination Free Evaluation of Large Language Models for Code. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/livecodebench?version=25.11) | `25.11` | codegeneration_release_latest, codegeneration_release_v1, codegeneration_release_v2, codegeneration_release_v3, codegeneration_release_v4, codegeneration_release_v5, codegeneration_release_v6, codegeneration_notfast, testoutputprediction, codeexecution_v2, codeexecution_v2_cot, livecodebench_0724_0125, livecodebench_aa_v2, livecodebench_0824_0225 |
| **lm-evaluation-harness** | This project provides a unified framework to test generative language models on a large number of different evaluation tasks. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/lm-evaluation-harness?version=25.11) | `25.11` | mmlu, mmlu_instruct, mmlu_cot_0_shot_chat, ifeval, mmlu_pro, mmlu_pro_instruct, mmlu_redux, mmlu_redux_instruct, m_mmlu_id_str, gsm8k, gsm8k_cot_instruct, gsm8k_cot_zeroshot, gsm8k_cot_llama, gsm8k_cot_zeroshot_llama, humaneval_instruct, mbpp_plus, mgsm, mgsm_cot, wikilingua, winogrande, arc_challenge, arc_challenge_chat, hellaswag, truthfulqa, bbh, bbh_instruct, musr, gpqa, gpqa_diamond_cot, frames_naive, frames_naive_with_links, frames_oracle, commonsense_qa, openbookqa, mmlu_logits, piqa, social_iqa, adlr_agieval_en_cot, adlr_math_500_4_shot_sampled, adlr_race, adlr_truthfulqa_mc2, adlr_arc_challenge_llama_25_shot, adlr_gpqa_diamond_cot_5_shot, adlr_mmlu, adlr_mmlu_pro_5_shot_base, adlr_minerva_math_nemo_4_shot, adlr_gsm8k_cot_8_shot, adlr_humaneval_greedy, adlr_humaneval_sampled, adlr_mbpp_sanitized_3_shot_greedy, adlr_mbpp_sanitized_3_shot_sampled, adlr_global_mmlu_lite_5_shot, adlr_mgsm_native_cot_8_shot, adlr_commonsense_qa_7_shot, adlr_winogrande_5_shot, bbq, arc_multilingual, hellaswag_multilingual, mmlu_prox, mmlu_prox_fr, mmlu_prox_de, mmlu_prox_it, mmlu_prox_ja, mmlu_prox_es, global_mmlu_full, global_mmlu_full_am, global_mmlu_full_ar, global_mmlu_full_bn, global_mmlu_full_cs, global_mmlu_full_de, global_mmlu_full_el, global_mmlu_full_en, global_mmlu_full_es, global_mmlu_full_fa, global_mmlu_full_fil, global_mmlu_full_fr, global_mmlu_full_ha, global_mmlu_full_he, global_mmlu_full_hi, global_mmlu_full_id, global_mmlu_full_ig, global_mmlu_full_it, global_mmlu_full_ja, global_mmlu_full_ko, global_mmlu_full_ky, global_mmlu_full_lt, global_mmlu_full_mg, global_mmlu_full_ms, global_mmlu_full_ne, global_mmlu_full_nl, global_mmlu_full_ny, global_mmlu_full_pl, global_mmlu_full_pt, global_mmlu_full_ro, global_mmlu_full_ru, global_mmlu_full_si, global_mmlu_full_sn, global_mmlu_full_so, global_mmlu_full_sr, global_mmlu_full_sv, global_mmlu_full_sw, global_mmlu_full_te, global_mmlu_full_tr, global_mmlu_full_uk, global_mmlu_full_vi, global_mmlu_full_yo, global_mmlu_full_zh, global_mmlu, global_mmlu_ar, global_mmlu_bn, global_mmlu_de, global_mmlu_en, global_mmlu_es, global_mmlu_fr, global_mmlu_hi, global_mmlu_id, global_mmlu_it, global_mmlu_ja, global_mmlu_ko, global_mmlu_pt, global_mmlu_sw, global_mmlu_yo, global_mmlu_zh, agieval |
| **mmath** | MMATH is a new benchmark specifically designed for multilingual complex reasoning. It comprises 374 carefully selected math problems from high-quality sources, including AIME, CNMO, and MATH-500, and covers ten typologically and geographically diverse languages. Each problem is translated and validated through a rigorous pipeline that combines frontier LLMs with human verification, ensuring semantic consistency. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/mmath?version=25.11) | `25.11` | mmath_en, mmath_zh, mmath_ar, mmath_es, mmath_fr, mmath_ja, mmath_ko, mmath_pt, mmath_th, mmath_vi |
| **mtbench** | MT-bench is designed to test multi-turn conversation and instruction-following ability, covering common use cases and focusing on challenging questions to differentiate models. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/mtbench?version=25.11) | `25.11` | mtbench, mtbench-cor1 |
| **nemo_skills** | NeMo Skills - a project to improve skills of LLMs | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/nemo_skills?version=25.11) | `25.11` | ns_aime2024, ns_aime2025, ns_gpqa, ns_bfcl_v3, ns_bfcl_v4, ns_livecodebench, ns_hle, ns_ruler, ns_mmlu, ns_mmlu_pro, ns_scicode, ns_aa_lcr, ns_ifbench |
| **profbench** | Professional domain benchmark for evaluating LLMs on Physics PhD, Chemistry PhD, Finance MBA, and Consulting MBA tasks | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/profbench?version=25.11) | `25.11` | report_generation, llm_judge |
| **safety_eval** | Harness for Safety evaluations | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/safety-harness?version=25.11) | `25.11` | aegis_v2, aegis_v2_reasoning, wildguard |
| **scicode** | SciCode is a challenging benchmark designed to evaluate the capabilities of LLMs in generating code for solving realistic scientific research problems. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/scicode?version=25.11) | `25.11` | scicode, scicode_background, scicode_aa_v2 |
| **simple_evals** | simple-evals - a lightweight library for evaluating language models. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/simple-evals?version=25.11) | `25.11` | AIME_2025, AIME_2024, AA_AIME_2024, AA_math_test_500, math_test_500, mgsm, humaneval, humanevalplus, mmlu_pro, mmlu_am, mmlu_ar, mmlu_bn, mmlu_cs, mmlu_de, mmlu_el, mmlu_en, mmlu_es, mmlu_fa, mmlu_fil, mmlu_fr, mmlu_ha, mmlu_he, mmlu_hi, mmlu_id, mmlu_ig, mmlu_it, mmlu_ja, mmlu_ko, mmlu_ky, mmlu_lt, mmlu_mg, mmlu_ms, mmlu_ne, mmlu_nl, mmlu_ny, mmlu_pl, mmlu_pt, mmlu_ro, mmlu_ru, mmlu_si, mmlu_sn, mmlu_so, mmlu_sr, mmlu_sv, mmlu_sw, mmlu_te, mmlu_tr, mmlu_uk, mmlu_vi, mmlu_yo, mmlu_ar-lite, mmlu_bn-lite, mmlu_de-lite, mmlu_en-lite, mmlu_es-lite, mmlu_fr-lite, mmlu_hi-lite, mmlu_id-lite, mmlu_it-lite, mmlu_ja-lite, mmlu_ko-lite, mmlu_my-lite, mmlu_pt-lite, mmlu_sw-lite, mmlu_yo-lite, mmlu_zh-lite, mmlu, gpqa_diamond, gpqa_extended, gpqa_main, simpleqa, aime_2025_nemo, aime_2024_nemo, math_test_500_nemo, gpqa_diamond_nemo, gpqa_diamond_aa_v2_llama_4, gpqa_diamond_aa_v2, AIME_2025_aa_v2, mgsm_aa_v2, mmlu_pro_aa_v2, mmlu_llama_4, mmlu_pro_llama_4, healthbench, healthbench_consensus, healthbench_hard, browsecomp |
| **tooltalk** | ToolTalk is designed to evaluate tool-augmented LLMs as a chatbot. ToolTalk contains a handcrafted dataset of 28 easy conversations and 50 hard conversations. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/tooltalk?version=25.11) | `25.11` | tooltalk |
| **vlmevalkit** | VLMEvalKit is an open-source evaluation toolkit of large vision-language models (LVLMs). It enables one-command evaluation of LVLMs on various benchmarks, without the heavy workload of data preparation under multiple repositories. In VLMEvalKit, we adopt generation-based evaluation for all LVLMs, and provide the evaluation results obtained with both exact matching and LLM-based answer extraction. | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit?version=25.11) | `25.11` | ai2d_judge, chartqa, mathvista-mini, mmmu_judge, ocrbench, ocr_reasoning, slidevqa |
-->
<!-- END AUTOGENERATION -->

 ## 🚀 Quickstart
 
 Get your first evaluation result in minutes. This guide uses your local machine to run a small benchmark against an OpenAI API-compatible endpoint.
