
Commit ccc06f3

chore(containers): speed up verifier/updater (of IRs) and add multiarch info (#577)

- adds the container arch to the harness and task IR
- uses it in the output of `ls tasks` (also from any container `--from URL`)
- uses it in the docs: for the table at https://docs-nemo-evaluator.nvidia.com/agronskiy/chore/containers/speed-up-verification-and-multiarch/__delim__/evaluation/benchmarks/catalog/index.html and for the per-task pages, e.g. https://docs-nemo-evaluator.nvidia.com/agronskiy/chore/containers/speed-up-verification-and-multiarch/__delim__/evaluation/benchmarks/catalog/all/harnesses/bigcode-evaluation-harness.html#bigcode-evaluation-harness-multiple-cpp

Signed-off-by: Alex Gronskiy <[email protected]>
1 parent 359b97d commit ccc06f3
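
To make the new field concrete: the docs table resolves an arch per harness by preferring a harness-level arch, then the first task's `container_arch`, then "unknown". Below is a minimal sketch of that precedence, using simplified stand-in dataclasses rather than the project's actual IR types (see autogen_task_yamls.py further down for the real code).

# Simplified stand-ins for the real IR types; only the fields the fallback
# touches are modeled here.
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class TaskIR:
    container_arch: Optional[str] = None


@dataclass
class HarnessIR:
    arch: Optional[str] = None
    tasks: list[TaskIR] = field(default_factory=list)


def resolve_arch(harness: HarnessIR) -> str:
    # Same precedence as in autogen_task_yamls.py below:
    # harness arch -> first task's container_arch -> "unknown".
    return (
        harness.arch
        or (harness.tasks[0].container_arch if harness.tasks else None)
        or "unknown"
    )


print(resolve_arch(HarnessIR(tasks=[TaskIR(container_arch="arm")])))  # arm
print(resolve_arch(HarnessIR()))  # unknown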

File tree

13 files changed: +888 −187 lines


README.md

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ NeMo Evaluator Launcher provides pre-built evaluation containers for different e
 | **vlmevalkit** | Vision-language model evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit) | `25.11` | AI2D, ChartQA, MMMU, MathVista-MINI, OCRBench, SlideVQA |

 <!-- BEGIN AUTOGENERATION -->
-<!-- mapping toml checksum: sha256:b7fdaa7f01a641970f864c6aab95d7f9e49b883dee8558e8636eb8018a01388e -->
+<!-- mapping toml checksum: sha256:881e6d1de31824c9e77a3e13c0a9ab988d6bab7cc9fab5b298ef1e5b1bdf1af9 -->
 <!--
 | Container | Description | NGC Catalog | Latest Tag | Supported benchmarks |
 |-----------|-------------|-------------|------------| ------------|

packages/nemo-evaluator-launcher/scripts/autogen_task_yamls.py

Lines changed: 22 additions & 2 deletions
@@ -115,6 +115,7 @@ def generate_yaml(self) -> dict:
            "harness": self.task_ir.harness,
            "container": self.task_ir.container,
            "container_digest": self.task_ir.container_digest,
+            "container_arch": getattr(self.task_ir, "container_arch", None),
            "defaults": self.task_ir.defaults,
        }

@@ -190,6 +191,9 @@ def generate_markdown_section(self, harness_id: str) -> list[str]:
        lines.append(str(self.task_ir.container_digest))
        lines.append("```")
        lines.append("")
+        container_arch = getattr(self.task_ir, "container_arch", None) or "unknown"
+        lines.append(f"**Container Arch:** `{container_arch}`")
+        lines.append("")
        if task_type:
            lines.append(f"**Task Type:** `{task_type}`")
            lines.append("")
@@ -522,12 +526,13 @@ generate_benchmarks_table_markdown(

    lines.append("```{list-table}")
    lines.append(":header-rows: 1")
-    lines.append(":widths: 20 25 15 15 25")
+    lines.append(":widths: 18 24 14 12 8 24")
    lines.append("")
    lines.append("* - Container")
    lines.append(" - Description")
    lines.append(" - NGC Catalog")
    lines.append(" - Latest Tag")
+    lines.append(" - Arch")
    lines.append(" - Tasks")

    # Sort harnesses alphabetically for consistent ordering
@@ -579,6 +584,12 @@
        # If no version found, use placeholder as fallback
        latest_tag = version if version else "{{ docker_compose_latest }}"

+        arch = (
+            harness.harness_ir.arch
+            or (harness.tasks[0].container_arch if harness.tasks else None)
+            or "unknown"
+        )
+
        # Escape special characters in markdown (but preserve links)
        # Some harnesses may store description as non-string types (e.g., list).
        if isinstance(description, list):
@@ -591,6 +602,7 @@
        lines.append(f" - {description_display}")
        lines.append(f" - {ngc_link}")
        lines.append(f" - {latest_tag}")
+        lines.append(f" - `{arch}`")
        lines.append(f" - {tasks_display}")

    lines.append("```")
@@ -650,11 +662,12 @@ generate_benchmarks_table_internal_markdown(

    lines.append("```{list-table}")
    lines.append(":header-rows: 1")
-    lines.append(":widths: 20 30 25 25")
+    lines.append(":widths: 18 30 18 8 26")
    lines.append("")
    lines.append("* - Container")
    lines.append(" - Description")
    lines.append(" - Container Ref")
+    lines.append(" - Arch")
    lines.append(" - Tasks")

    sorted_harnesses = sorted(harnesses, key=lambda h: h.harness_name.lower())
@@ -693,10 +706,16 @@
        description_display = description_text.replace("|", "\\|").replace("\n", " ")

        container_ref_display = f"`{container_ref}`" if container_ref else "N/A"
+        arch = (
+            harness.harness_ir.arch
+            or (harness.tasks[0].container_arch if harness.tasks else None)
+            or "unknown"
+        )

        lines.append(f"* - {container_display}")
        lines.append(f" - {description_display}")
        lines.append(f" - {container_ref_display}")
+        lines.append(f" - `{arch}`")
        lines.append(f" - {tasks_display}")

        lines.append("```")
@@ -903,6 +922,7 @@ def main():
            url=None,
            container=container,
            container_digest=container_digest,
+            arch=None,
        )
        harnesses.append(_HarnessAutogen(harness_ir, tasks))

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/api/functional.py

Lines changed: 2 additions & 1 deletion
@@ -36,7 +36,7 @@ def get_tasks_list() -> list[list[Any]]:
    """Get a list of available tasks from the mapping.

    Returns:
-        list[list[Any]]: Each sublist contains task name, endpoint type, harness, container, description, and type.
+        list[list[Any]]: Each sublist contains task name, endpoint type, harness, container, arch, description, and type.
    """
    mapping = load_tasks_mapping()
    data = [
@@ -45,6 +45,7 @@ def get_tasks_list() -> list[list[Any]]:
            task_data.get("endpoint_type"),
            task_data.get("harness"),
            task_data.get("container"),
+            task_data.get("arch", ""),
            task_data.get("description", ""),
            task_data.get("type", ""),
        ]
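
Assuming the module path implied by this file, a downstream caller could read the new column roughly as follows; this is an illustrative sketch, and the index used for `arch` simply follows the documented column order above.

# Illustrative consumer of get_tasks_list(); the column order (task name,
# endpoint type, harness, container, arch, description, type) follows the
# docstring patched above.
from nemo_evaluator_launcher.api.functional import get_tasks_list

for row in get_tasks_list():
    task_name, arch = row[0], row[4]  # arch is the fifth column
    print(f"{task_name:<40} {arch or 'unknown'}")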

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/cli/ls_tasks.py

Lines changed: 100 additions & 45 deletions
@@ -95,8 +95,6 @@ def execute(self) -> None:
                if isinstance(endpoint_types, str):
                    endpoint_types = [endpoint_types]

-                task_type = task.defaults.get("config", {}).get("type", "")
-
                data.append(
                    [
                        task.name,  # task
@@ -105,8 +103,8 @@ def execute(self) -> None:
                        else endpoint_types,  # endpoint_type
                        task.harness,  # harness
                        task.container,  # container
+                        getattr(task, "container_arch", "") or "",  # arch
                        task.description,  # description
-                        task_type,  # type
                    ]
                )
        else:
@@ -121,13 +119,17 @@ def execute(self) -> None:
            "endpoint_type",
            "harness",
            "container",
+            "arch",
            "description",
-            "type",
        ]
        supported_benchmarks = []
        for task_data in data:
-            assert len(task_data) == len(headers)
-            supported_benchmarks.append(dict(zip(headers, task_data)))
+            if len(task_data) < len(headers):
+                raise ValueError(
+                    f"Invalid task row shape: expected at least {len(headers)} columns, got {len(task_data)}"
+                )
+            # Backwards/forwards compat: allow extra columns and ignore them.
+            supported_benchmarks.append(dict(zip(headers, task_data[: len(headers)])))

        if self.json:
            print(json.dumps({"tasks": supported_benchmarks}, indent=2))
@@ -140,6 +142,50 @@ def _print_table(self, tasks: list[dict]) -> None:
            print("No tasks found.")
            return

+        def _truncate(s: str, max_len: int) -> str:
+            s = s or ""
+            if max_len <= 0:
+                return ""
+            if len(s) <= max_len:
+                return s
+            if max_len <= 3:
+                return s[:max_len]
+            return s[: max_len - 3] + "..."
+
+        def _infer_arch(container: str, container_tasks: list[dict]) -> str:
+            # Prefer explicit arch from task IRs.
+            for t in container_tasks:
+                a = (t.get("arch") or "").strip()
+                if a:
+                    return a
+
+            # Heuristic fallback: look for common suffixes in tag.
+            c = (container or "").lower()
+            if "arm64" in c or "aarch64" in c:
+                return "arm"
+            if "amd64" in c or "x86_64" in c:
+                return "amd"
+            return "unknown"
+
+        def _infer_registry(container: str) -> str:
+            try:
+                from nemo_evaluator_launcher.common.container_metadata.utils import (
+                    parse_container_image,
+                )
+
+                registry_type, _registry_url, _repo, _ref = parse_container_image(
+                    container
+                )
+                return str(registry_type)
+            except Exception:
+                # Best-effort fallback for unknown formats.
+                c = (container or "").lower()
+                if "nvcr.io/" in c or c.startswith("nvcr.io"):
+                    return "nvcr"
+                if "gitlab" in c:
+                    return "gitlab"
+                return ""
+
        # Group tasks by harness and container
        grouped = defaultdict(lambda: defaultdict(list))
        for task in tasks:
@@ -156,73 +202,82 @@ def _print_table(self, tasks: list[dict]) -> None:
                if j > 0:
                    print()  # Spacing between containers

-                # Prepare task table first to get column widths
-                task_header = "task"
                rows = []
                for task in container_tasks:
-                    task_name = task["task"]
-                    endpoint_type = task["endpoint_type"]
-                    task_type = task.get("type", "")
-                    description = task.get("description", "")
-                    # Format: task_name (endpoint_type, task_type) - first 30 chars...
-                    description_preview = description[:30] if description else ""
-                    if len(description) > 30:
-                        description_preview += "..."
-
-                    # Build the display name
-                    type_part = f"{endpoint_type}"
-                    if task_type:
-                        type_part += f", {task_type}"
-                    display_name = f"{task_name} ({type_part})"
-                    if description_preview:
-                        display_name = f"{display_name} - {description_preview}"
-                    rows.append(display_name)
-
-                # Sort tasks alphabetically for better readability
-                rows.sort()
-
-                # Calculate column width
-                max_task_width = (
-                    max(len(task_header), max(len(str(row)) for row in rows)) + 2
-                )
+                    rows.append(
+                        {
+                            "task": str(task.get("task", "")),
+                            "endpoint": str(task.get("endpoint_type", "")),
+                            "description": str(task.get("description", "")),
+                        }
+                    )
+                rows.sort(key=lambda r: r["task"].lower())

                # Calculate required width for header content
                harness_line = f"harness: {harness}"
                container_line = f"container: {container}"
+                arch_line = f"arch: {_infer_arch(container, container_tasks)}"
+                registry_line = f"registry: {_infer_registry(container)}"
                header_content_width = (
-                    max(len(harness_line), len(container_line)) + 4
+                    max(
+                        len(harness_line),
+                        len(container_line),
+                        len(arch_line),
+                        len(registry_line),
+                    )
+                    + 4
                )  # +4 for "| " and " |"

-                # Use the larger of the two widths
-                table_width = max(max_task_width, header_content_width)
-
                # Limit separator width to prevent overflow on small terminals
                # Use terminal width if available, otherwise cap at 120 characters
                import shutil

                try:
                    terminal_width = shutil.get_terminal_size().columns
-                    separator_width = min(
-                        table_width, terminal_width - 2
-                    )  # -2 for safety margin
+                    separator_width = min(terminal_width - 2, 160)  # -2 safety margin
                except Exception:
                    # Fallback if terminal size can't be determined
-                    separator_width = min(table_width, 120)
+                    separator_width = 120
+
+                separator_width = max(separator_width, min(header_content_width, 160))
+
+                # Table columns (keep compact and stable).
+                col_task = 36
+                col_endpoint = 14
+                sep = " "
+                fixed = col_task + col_endpoint + len(sep) * 2
+                col_desc = max(20, separator_width - fixed)

                # Print combined header with harness and container info - colorized
                # Keys: magenta, Values: cyan (matching logging utils)
                print(bold("=" * separator_width))
                print(f"{magenta('harness:')} {cyan(str(harness))}")
                print(f"{magenta('container:')} {cyan(str(container))}")
+                arch = _infer_arch(container, container_tasks)
+                registry = _infer_registry(container)
+                print(f"{magenta('arch:')} {cyan(str(arch))}")
+                if registry:
+                    print(f"{magenta('registry:')} {cyan(str(registry))}")

                # Print task table header separator
-                print(" " * table_width)
-                print(bold(f"{task_header:<{table_width}}"))
+                print()
+                print(
+                    bold(
+                        f"{'task':<{col_task}}{sep}"
+                        f"{'endpoint':<{col_endpoint}}{sep}"
+                        f"{'description':<{col_desc}}"
+                    )
+                )
                print(bold("-" * separator_width))

                # Print task rows - use grey for task descriptions
-                for row in rows:
-                    print(f"{grey(str(row)):<{table_width}}")
+                for r in rows:
+                    line = (
+                        f"{_truncate(r['task'], col_task):<{col_task}}{sep}"
+                        f"{_truncate(r['endpoint'], col_endpoint):<{col_endpoint}}{sep}"
+                        f"{_truncate(r['description'], col_desc):<{col_desc}}"
+                    )
+                    print(grey(line))

                print(bold("-" * separator_width))
                # Show task count - grey for count text
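
For reference, the tag heuristic that `_infer_arch` above falls back to (used only when no task IR carries an explicit arch) can be restated standalone as follows; the image references in the checks are hypothetical examples, not real tags.

# Standalone restatement of _infer_arch's heuristic fallback branch.
def infer_arch_from_tag(container: str) -> str:
    c = (container or "").lower()
    if "arm64" in c or "aarch64" in c:
        return "arm"
    if "amd64" in c or "x86_64" in c:
        return "amd"
    return "unknown"


# Hypothetical image references, shown only to illustrate the mapping.
assert infer_arch_from_tag("nvcr.io/nvidia/eval-factory/example:25.11-arm64") == "arm"
assert infer_arch_from_tag("nvcr.io/nvidia/eval-factory/example:25.11") == "unknown"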
