
Commit f5278cc

aivanou authored and facebook-github-bot committed
Move poll_trial_status and poll_available_capacity to TorchXRunner, remove TorchXScheduler (#361)
Summary:
Pull Request resolved: #361

Adjusting the TorchX setup following D31031589 and D31032567. Deprecate `torchx.runtime.hpo.ax.TorchXScheduler`.

Reviewed By: lena-kashtelyan

Differential Revision: D33062113

fbshipit-source-id: 3166c11f6392c52039f1f08140e2d831772bba06
1 parent 9784454 commit f5278cc
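
In practice the migration is mechanical: code that previously constructed a `TorchXScheduler` now constructs Ax's own `Scheduler` directly, since all TorchX-specific polling has moved onto `TorchXRunner`. A minimal sketch of the new wiring, assuming `experiment` is an Ax `Experiment` whose runner is a `TorchXRunner` (the experiment and runner setup is elided here and illustrative only):

# Migration sketch: TorchXScheduler is gone; use Ax's Scheduler directly.
# `experiment` is assumed to be an ax.core.Experiment with a TorchXRunner
# attached as its runner, as in the example diff below.
from ax.modelbridge.dispatch_utils import choose_generation_strategy
from ax.service.scheduler import Scheduler, SchedulerOptions

scheduler = Scheduler(
    experiment=experiment,  # Experiment with runner=TorchXRunner(...)
    generation_strategy=choose_generation_strategy(
        search_space=experiment.search_space,
    ),
    options=SchedulerOptions(),
)
scheduler.run_all_trials()  # status polling goes through TorchXRunner.poll_trial_status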

6 files changed (+17 additions, -59 deletions)


dev-requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 aiobotocore==2.1.0
-ax-platform[mysql]==0.2.2
+ax-platform[mysql]==0.2.3
 black==21.10b0
 boto3==1.20.24
 captum>=0.4.0

docs/source/runtime/hpo.rst

Lines changed: 1 addition & 2 deletions
@@ -14,5 +14,4 @@ Ax (Adaptive Experimentation)
 .. currentmodule:: torchx.runtime.hpo.ax

 .. autoclass:: TorchXRunner
-.. autoclass:: TorchXScheduler
-.. autoclass:: AppMetric
+.. autoclass:: AppMetric

torchx/cli/test/cmd_run_test.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ def setUp(self) -> None:
         self.cmd_run.add_arguments(self.parser)

     def tearDown(self) -> None:
-        shutil.rmtree(self.tmpdir)
+        shutil.rmtree(self.tmpdir, ignore_errors=True)

     def test_run_with_user_conf_abs_path(self) -> None:
         args = self.parser.parse_args(
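
The `tearDown` tweak is test hygiene riding along with the main change: `ignore_errors=True` keeps teardown from raising if the temp directory is already gone. A self-contained sketch of the pattern (the class name is hypothetical, for illustration only):

import shutil
import tempfile
import unittest

class TmpDirTest(unittest.TestCase):  # hypothetical name
    def setUp(self) -> None:
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self) -> None:
        # ignore_errors=True: do not fail if the directory was already
        # removed, e.g. by the test body itself.
        shutil.rmtree(self.tmpdir, ignore_errors=True)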

torchx/runtime/hpo/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -54,13 +54,13 @@
     SearchSpace,
 )
 from ax.modelbridge.dispatch_utils import choose_generation_strategy
-from ax.service.scheduler import SchedulerOptions
+from ax.service.scheduler import SchedulerOptions, Scheduler
 from ax.service.utils.best_point import get_best_parameters
 from ax.service.utils.report_utils import exp_to_df
 from ax.utils.common.constants import Keys
 from pyre_extensions import none_throws
 from torchx.components import utils
-from torchx.runtime.hpo.ax import AppMetric, TorchXRunner, TorchXScheduler
+from torchx.runtime.hpo.ax import AppMetric, TorchXRunner

 # Run HPO on the booth function (https://en.wikipedia.org/wiki/Test_functions_for_optimization)

@@ -100,7 +100,7 @@
     properties={Keys.IMMUTABLE_SEARCH_SPACE_AND_OPT_CONF: True},
 )

-scheduler = TorchXScheduler(
+scheduler = Scheduler(
     experiment=experiment,
     generation_strategy=(
         choose_generation_strategy(
torchx/runtime/hpo/ax.py

Lines changed: 5 additions & 46 deletions
@@ -6,16 +6,15 @@
 # LICENSE file in the root directory of this source tree.

 import inspect
-from typing import Iterable, Any, Callable, Dict, Mapping, Optional, Set, cast
+from typing import Any, Callable, Dict, Mapping, Optional, Set, cast, Iterable

 import pandas as pd
 from ax.core import Trial
 from ax.core.base_trial import BaseTrial
 from ax.core.data import Data
 from ax.core.metric import Metric
 from ax.core.runner import Runner as ax_Runner
-from ax.service.scheduler import Scheduler as ax_Scheduler, TrialStatus
-from ax.utils.common.typeutils import not_none
+from ax.service.scheduler import TrialStatus
 from pyre_extensions import none_throws
 from torchx.runner import Runner, get_runner
 from torchx.runtime.tracking import FsspecResultTracker
@@ -209,14 +208,14 @@ def run(self, trial: BaseTrial) -> Dict[str, Any]:
     def poll_trial_status(
         self, trials: Iterable[BaseTrial]
     ) -> Dict[TrialStatus, Set[int]]:
-        """Returns the statuses of the given trials."""
         trial_statuses: Dict[TrialStatus, Set[int]] = {}

         for trial in trials:
             app_handle: str = trial.run_metadata[_TORCHX_APP_HANDLE]
-            app_status: Optional[AppStatus] = self._torchx_runner.status(app_handle)
-            assert app_status is not None
+            torchx_runner = trial.run_metadata[_TORCHX_RUNNER]
+            app_status: AppStatus = torchx_runner.status(app_handle)
             trial_status = APP_STATE_TO_TRIAL_STATUS[app_status.state]
+
             indices = trial_statuses.setdefault(trial_status, set())
             indices.add(trial.index)

@@ -227,43 +226,3 @@ def stop(self, trial: BaseTrial, reason: Optional[str] = None) -> Dict[str, Any]
         app_handle: str = trial.run_metadata[_TORCHX_APP_HANDLE]
         self._torchx_runner.stop(app_handle)
         return {"reason": reason} if reason else {}
-
-
-class TorchXScheduler(ax_Scheduler):
-    """
-    An implementation of an `Ax Scheduler <https://ax.dev/tutorials/scheduler.html>`_
-    that works with Experiments hooked up with the ``TorchXRunner``.
-
-    This scheduler is not a real scheduler but rather a facade scheduler
-    that delegates to scheduler clients for various remote/local schedulers.
-    For a list of supported schedulers please refer to TorchX
-    `scheduler docs <https://pytorch.org/torchx/latest/schedulers.html>`_.
-
-    """
-
-    def poll_trial_status(
-        self, poll_all_trial_statuses: bool = False
-    ) -> Dict[TrialStatus, Set[int]]:
-        return cast(TorchXRunner, self.experiment.runner).poll_trial_status(
-            self.running_trials
-        )
-
-    def poll_available_capacity(self) -> int:
-        """
-        Used when ``run_trials_in_batches`` option is set.
-        Since this scheduler is a faux scheduler, this method
-        always returns the ``max_parallelism`` of the current
-        step of this scheduler's ``generation_strategy``.
-
-        .. note:: The trials (jobs) are simply submitted to the
-            scheduler in parallel. Typically the trials will be
-            queued in the scheduler's job queue (on the server-side)
-            and executed according to the scheduler's job priority
-            and scheduling policies.
-
-        """
-        return (
-            -1
-            if self.generation_strategy._curr.max_parallelism is None
-            else not_none(self.generation_strategy._curr.max_parallelism)
-        )
torchx/runtime/hpo/test/ax_test.py

Lines changed: 6 additions & 6 deletions
@@ -22,14 +22,14 @@
     SearchSpace,
 )
 from ax.modelbridge.dispatch_utils import choose_generation_strategy
-from ax.service.scheduler import SchedulerOptions
+from ax.service.scheduler import SchedulerOptions, Scheduler
 from ax.service.utils.report_utils import exp_to_df
 from ax.utils.common.constants import Keys
 from torchx.components import utils
-from torchx.runtime.hpo.ax import AppMetric, TorchXRunner, TorchXScheduler
+from torchx.runtime.hpo.ax import AppMetric, TorchXRunner


-class TorchXSchedulerTest(unittest.TestCase):
+class TorchXAxTest(unittest.TestCase):
     def setUp(self) -> None:
         self.test_dir = tempfile.mkdtemp("torchx_runtime_hpo_ax_test")

@@ -84,7 +84,7 @@ def test_run_experiment_locally(self) -> None:

         # maybe add-on cfg into SchedulerOption?
         # so that we can pass it from one place
-        scheduler = TorchXScheduler(
+        scheduler = Scheduler(
             experiment=experiment,
             generation_strategy=(
                 choose_generation_strategy(
@@ -114,7 +114,7 @@ def test_stop_trials(self) -> None:
             is_test=True,
             properties={Keys.IMMUTABLE_SEARCH_SPACE_AND_OPT_CONF: True},
         )
-        scheduler = TorchXScheduler(
+        scheduler = Scheduler(
             experiment=experiment,
             generation_strategy=(
                 choose_generation_strategy(
@@ -152,7 +152,7 @@ def test_run_experiment_locally_in_batches(self) -> None:

         # maybe add-on cfg into SchedulerOption?
         # so that we can pass it from one place
-        scheduler = TorchXScheduler(
+        scheduler = Scheduler(
             experiment=experiment,
             generation_strategy=(
                 choose_generation_strategy(
