Merge pull request #780 from fsschneider/dev

priyakasimbeg · web-flow · commit 3b832f45e887 · 2024-08-29T10:14:12.000-07:00
Fix scoring bug, properly handling `nan` values
diff --git a/.gitignore b/.gitignore
@@ -20,6 +20,7 @@ algorithmic_efficiency/workloads/librispeech_conformer/work_dir
 *.vocab
 wandb/
 *.txt
+scoring/plots/
 
 !scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
 !scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv
diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py
@@ -26,6 +26,7 @@
   the dictionary of submissions.
 """
 import itertools
+import json
 import operator
 import os
 import re
@@ -45,6 +46,10 @@
 BASE_WORKLOADS = workloads_registry.BASE_WORKLOADS
 WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
 BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/'
+# Open json file to read heldout workloads
+# TODO: This probably shouldn't be hardcoded but passed as an argument.
+with open("held_out_workloads_algoperf_v05.json", "r") as f:
+  HELDOUT_WORKLOADS = json.load(f)
 # These global variables have to be set according to the current set of
 # workloads and rules for the scoring to be correct.
 # We do not use the workload registry since it contains test and development
@@ -248,6 +253,9 @@ def filter(x):
     try:
       if x[variant_workload] == np.inf:
         return np.inf
+      # Also check for nan values (e.g. OOMs)
+      elif np.isnan(x[variant_workload]):
+        return np.inf
       else:
         return x[base_workload]
     except KeyError as e:
@@ -306,8 +314,14 @@ def compute_performance_profiles(submissions,
                                      self_tuning_ruleset,
                                      strict))
   df = pd.concat(dfs)
-
-  # For each held-out workload set to inf if the base workload is inf
+  # Restrict to base and sampled held-out workloads
+  # (ignore the additional workload variants of the baseline
+  # as they cause issues when checking for nans in workload variants).
+  df = df[BASE_WORKLOADS + HELDOUT_WORKLOADS]
+  # Sort workloads alphabetically (for better display)
+  df = df.reindex(sorted(df.columns), axis=1)
+
+  # For each held-out workload set to inf if the base workload is inf or nan
   for workload in df.keys():
     if workload not in BASE_WORKLOADS:
       # If base do not have finite score set variant score to inf
@@ -319,14 +333,13 @@ def compute_performance_profiles(submissions,
   best_scores = df.min(axis=0)
   df[df.apply(lambda x: x > 4 * best_scores, axis=1)] = np.inf
 
-  # For each held-out workload if variant target was not hit set submission to inf
+  # For each base workload if variant target was not hit set submission to inf
   for workload in df.keys():
     if workload not in BASE_WORKLOADS:
       # If variants do not have finite score set base_workload score to inf
       base_workload = get_base_workload_name(workload)
       df[base_workload] = df.apply(
           variant_criteria_filter(base_workload, workload), axis=1)
-
   df = df[BASE_WORKLOADS]
 
   if verbosity > 0: