Unity-Technologies · harperj · Jul 10, 2020 · Jun 5, 2020 · Jun 9, 2020 · Jun 9, 2020
diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md
@@ -96,8 +96,8 @@ in the `results/<run-identifier>` folder:
    blocks. See [Profiling in Python](Profiling-Python.md) for more information
    on the timers generated.
 
-These artifacts (except the `.nn` file) are updated throughout the training
-process and finalized when training completes or is interrupted.
+These artifacts are updated throughout the training
+process and finalized when training is completed or interrupted.
 
 #### Stopping and Resuming Training
 

diff --git a/ml-agents/mlagents/model_serialization.py b/ml-agents/mlagents/model_serialization.py
@@ -60,6 +60,7 @@
 class SerializationSettings(NamedTuple):
     model_path: str
     brain_name: str
+    checkpoint_path: str = ""
     convert_to_barracuda: bool = True
     convert_to_onnx: bool = True
     onnx_opset: int = 9
@@ -72,15 +73,24 @@ def export_policy_model(
     Exports latest saved model to .nn format for Unity embedding.
     """
     frozen_graph_def = _make_frozen_graph(settings, graph, sess)
+    if not os.path.exists(settings.model_path):
+        os.makedirs(settings.model_path)
     # Save frozen graph
     frozen_graph_def_path = settings.model_path + "/frozen_graph_def.pb"
     with gfile.GFile(frozen_graph_def_path, "wb") as f:
         f.write(frozen_graph_def.SerializeToString())
 
     # Convert to barracuda
     if settings.convert_to_barracuda:
-        tf2bc.convert(frozen_graph_def_path, settings.model_path + ".nn")
-        logger.info(f"Exported {settings.model_path}.nn file")
+        if settings.checkpoint_path:
+            tf2bc.convert(
+                frozen_graph_def_path,
+                os.path.join(settings.model_path, f"{settings.checkpoint_path}.nn"),
+            )
+            logger.info(f"Exported {settings.checkpoint_path}.nn file")
+        else:
+            tf2bc.convert(frozen_graph_def_path, settings.model_path + ".nn")
+            logger.info(f"Exported {settings.model_path}.nn file")
 
     # Save to onnx too (if we were able to import it)
     if ONNX_EXPORT_ENABLED:

diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py
@@ -117,7 +117,6 @@ def __init__(
         self.current_policy_snapshot: Dict[str, List[float]] = {}
 
         self.snapshot_counter: int = 0
-        self.policies: Dict[str, TFPolicy] = {}
 
         # wrapped_training_team and learning team need to be separate
         # in the situation where new agents are created destroyed
@@ -298,21 +297,11 @@ def end_episode(self):
         """
         self.trainer.end_episode()
 
-    def save_model(self, name_behavior_id: str) -> None:
+    def save_model(self) -> None:
         """
-        Forwarding call to wrapped trainers save_model
+        Forwards the call to the wrapped trainer's save_model.
         """
-        parsed_behavior_id = self._name_to_parsed_behavior_id[name_behavior_id]
-        brain_name = parsed_behavior_id.brain_name
-        self.trainer.save_model(brain_name)
-
-    def export_model(self, name_behavior_id: str) -> None:
-        """
-        Forwarding call to wrapped trainers export_model.
-        """
-        parsed_behavior_id = self._name_to_parsed_behavior_id[name_behavior_id]
-        brain_name = parsed_behavior_id.brain_name
-        self.trainer.export_model(brain_name)
+        self.trainer.save_model()
 
     def create_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters

diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py
@@ -88,6 +88,7 @@ def run_training(run_seed: int, options: RunOptions) -> None:
             GlobalTrainingStatus.load_state(
                 os.path.join(run_logs_dir, "training_status.json")
             )
+
         # Configure CSV, Tensorboard Writers and StatsReporter
         # We assume reward and episode length are needed in the CSV.
         csv_writer = CSVWriter(

diff --git a/ml-agents/mlagents/trainers/policy/checkpoint_manager.py b/ml-agents/mlagents/trainers/policy/checkpoint_manager.py
@@ -0,0 +1,95 @@
+# # Unity ML-Agents Toolkit
+from typing import Dict, Any, Optional, List
+import os
+import attr
+from mlagents.trainers.training_status import GlobalTrainingStatus, StatusType
+from mlagents_envs.logging_util import get_logger
+
+logger = get_logger(__name__)
+
+
+@attr.s(auto_attribs=True)
+class Checkpoint:
+    steps: int
+    file_path: str
+    reward: Optional[float]
+    creation_time: float
+
+
+class CheckpointManager:
+    @staticmethod
+    def get_checkpoints(behavior_name: str) -> List[Dict[str, Any]]:
+        checkpoint_list = GlobalTrainingStatus.get_parameter_state(
+            behavior_name, StatusType.CHECKPOINTS
+        )
+        if not checkpoint_list:
+            checkpoint_list = []
+            GlobalTrainingStatus.set_parameter_state(
+                behavior_name, StatusType.CHECKPOINTS, checkpoint_list
+            )
+        return checkpoint_list
+
+    @staticmethod
+    def remove_checkpoint(checkpoint: Dict[str, Any]) -> None:
+        """
+        Removes the model file referenced by a checkpoint stored in checkpoint_list.
+        If the file cannot be found, nothing is deleted and a message is logged.
+        :param checkpoint: A checkpoint stored in checkpoint_list
+        """
+        file_path: str = checkpoint["file_path"]
+        if os.path.exists(file_path):
+            os.remove(file_path)
+            logger.info(f"Removed checkpoint model {file_path}.")
+        else:
+            logger.info(f"Checkpoint at {file_path} could not be found.")
+        return
+
+    @classmethod
+    def manage_checkpoint_list(
+        cls, behavior_name: str, keep_checkpoints: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Ensures that the number of checkpoints stored are within the number
+        of checkpoints the user defines. If the limit is hit, checkpoints are
+        removed to create room for the next checkpoint to be inserted.
+        :param behavior_name: The behavior name whose checkpoints are tracked.
+        :param keep_checkpoints: Number of checkpoints to record (user-defined).
+        """
+        checkpoints = cls.get_checkpoints(behavior_name)
+        while len(checkpoints) >= keep_checkpoints:
+            if (keep_checkpoints <= 0) or (len(checkpoints) == 0):
+                break
+            CheckpointManager.remove_checkpoint(checkpoints.pop(0))
+        return checkpoints
+
+    @classmethod
+    def track_checkpoint_info(
+        cls, behavior_name: str, new_checkpoint: Checkpoint, keep_checkpoints: int
+    ) -> None:
+        """
+        Make room for new checkpoint if needed and insert new checkpoint information.
+        :param behavior_name: The behavior name whose checkpoints are tracked.
+        :param new_checkpoint: The new checkpoint to be recorded.
+        :param keep_checkpoints: Number of checkpoints to record (user-defined).
+        """
+        checkpoints = cls.manage_checkpoint_list(behavior_name, keep_checkpoints)
+        new_checkpoint_dict = attr.asdict(new_checkpoint)
+        checkpoints.append(new_checkpoint_dict)
+
+    @classmethod
+    def track_final_model_info(
+        cls, behavior_name: str, final_model: Checkpoint, keep_checkpoints: int
+    ) -> None:
+        """
+        Ensures number of checkpoints stored is within the max number of checkpoints
+        defined by the user and finally stores the information about the final
+        model (or intermediate model if training is interrupted).
+        :param behavior_name: The behavior name whose final model is recorded.
+        :param final_model: The Checkpoint describing the final model.
+        :param keep_checkpoints: Number of checkpoints to record (user-defined).
+        """
+        CheckpointManager.manage_checkpoint_list(behavior_name, keep_checkpoints)
+        final_model_dict = attr.asdict(final_model)
+        GlobalTrainingStatus.set_parameter_state(
+            behavior_name, StatusType.FINAL_MODEL, final_model_dict
+        )
diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py
@@ -2,10 +2,14 @@
 import abc
 import os
 import numpy as np
+import time
 from distutils.version import LooseVersion
 
+from mlagents.model_serialization import SerializationSettings, export_policy_model
 from mlagents.tf_utils import tf
 from mlagents import tf_utils
+from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
+from mlagents.trainers.policy.checkpoint_manager import Checkpoint, CheckpointManager
 from mlagents_envs.exception import UnityException
 from mlagents_envs.logging_util import get_logger
 from mlagents.trainers.policy import Policy
@@ -70,6 +74,9 @@ def __init__(
         self.sequence_length = 1
         self.seed = seed
         self.brain = brain
+        self.behavior_id = BehaviorIdentifiers.from_name_behavior_id(
+            self.brain.brain_name
+        )
 
         self.act_size = brain.vector_action_space_size
         self.vec_obs_size = brain.vector_observation_space_size
@@ -392,18 +399,52 @@ def get_update_vars(self):
         """
         return list(self.update_dict.keys())
 
-    def save_model(self, steps):
+    def checkpoint(self, model_reward: Optional[float] = None) -> None:
         """
-        Saves the model
-        :param steps: The number of steps the model was trained for
-        :return:
+        Writes an intermediate checkpoint model to disk.
+        :param model_reward: Mean reward of the reward buffer at the time of saving
         """
+        current_step = self.get_current_step()
         with self.graph.as_default():
-            last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
-            self.saver.save(self.sess, last_checkpoint)
+            last_checkpoint = os.path.join(
+                self.model_path, f"model-{current_step}.ckpt"
+            )
+            if self.saver:
+                self.saver.save(self.sess, last_checkpoint)
             tf.train.write_graph(
                 self.graph, self.model_path, "raw_graph_def.pb", as_text=False
             )
+        brain_name = self.behavior_id.brain_name
+        checkpoint_path = f"{brain_name}-{current_step}"
+        settings = SerializationSettings(self.model_path, brain_name, checkpoint_path)
+        export_policy_model(settings, self.graph, self.sess)
+        # Store steps and file_path
+        new_checkpoint = Checkpoint(
+            int(current_step),
+            os.path.join(self.model_path, f"{settings.checkpoint_path}.nn"),
+            model_reward,
+            time.time(),
+        )
+        # Record checkpoint information
+        CheckpointManager.track_checkpoint_info(
+            brain_name, new_checkpoint, self.keep_checkpoints
+        )
+
+    def save(self, model_reward: Optional[float] = None) -> None:
+        """
+        Saves the final model on completion or interruption.
+        :param model_reward: Mean reward of the reward buffer at the time of saving
+        """
+        current_step = self.get_current_step()
+        brain_name = self.behavior_id.brain_name
+        settings = SerializationSettings(self.model_path, brain_name)
+        final_model = Checkpoint(
+            int(current_step), f"{settings.model_path}.nn", model_reward, time.time()
+        )
+        CheckpointManager.track_final_model_info(
+            brain_name, final_model, self.keep_checkpoints
+        )
+        export_policy_model(settings, self.graph, self.sess)
 
     def update_normalization(self, vector_obs: np.ndarray) -> None:
         """

diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -229,6 +229,7 @@ def add_policy(
         if not isinstance(policy, NNPolicy):
             raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
         self.policy = policy
+        self.policies[parsed_behavior_id.behavior_id] = policy
         self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -76,12 +76,21 @@ def __init__(
 
         self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer
 
-    def save_model(self, name_behavior_id: str) -> None:
+    def _checkpoint(self) -> None:
         """
-        Saves the model. Overrides the default save_model since we want to save
-        the replay buffer as well.
+        Writes a checkpoint model via the parent trainer's _checkpoint.
+        Overrides the default to also save the replay buffer when enabled.
         """
-        self.policy.save_model(self.get_step)
+        super()._checkpoint()
+        if self.checkpoint_replay_buffer:
+            self.save_replay_buffer()
+
+    def save_model(self) -> None:
+        """
+        Saves the final training model via the parent trainer's save_model.
+        Overrides the default to also save the replay buffer when enabled.
+        """
+        super().save_model()
         if self.checkpoint_replay_buffer:
             self.save_replay_buffer()
 
@@ -308,7 +317,6 @@ def add_policy(
     ) -> None:
         """
         Adds policy to trainer.
-        :param brain_parameters: specifications for policy construction
         """
         if self.policy:
             logger.warning(
@@ -320,6 +328,7 @@ def add_policy(
         if not isinstance(policy, NNPolicy):
             raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
         self.policy = policy
+        self.policies[parsed_behavior_id.behavior_id] = policy
         self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

diff --git a/ml-agents/mlagents/trainers/tests/test_barracuda_converter.py b/ml-agents/mlagents/trainers/tests/test_barracuda_converter.py
@@ -44,7 +44,7 @@ def test_policy_conversion(tmpdir, rnn, visual, discrete):
         use_discrete=discrete,
         use_visual=visual,
     )
-    policy.save_model(1000)
+    policy.checkpoint()
     settings = SerializationSettings(
         policy.model_path, os.path.join(tmpdir, policy.brain.brain_name)
     )

diff --git a/ml-agents/mlagents/trainers/tests/test_config_conversion.py b/ml-agents/mlagents/trainers/tests/test_config_conversion.py
@@ -132,11 +132,11 @@ def test_convert_behaviors(trainer_type, use_recurrent):
     if trainer_type == TrainerType.PPO:
         trainer_config = PPO_CONFIG
         trainer_settings_type = PPOSettings
-    elif trainer_type == TrainerType.SAC:
+    else:
         trainer_config = SAC_CONFIG
         trainer_settings_type = SACSettings
 
-    old_config = yaml.load(trainer_config)
+    old_config = yaml.safe_load(trainer_config)
     old_config[BRAIN_NAME]["use_recurrent"] = use_recurrent
     new_config = convert_behaviors(old_config)
 

diff --git a/ml-agents/mlagents/trainers/tests/test_nn_policy.py b/ml-agents/mlagents/trainers/tests/test_nn_policy.py
@@ -57,7 +57,7 @@ def test_load_save(tmp_path):
     policy = create_policy_mock(trainer_params, model_path=path1)
     policy.initialize_or_load()
     policy._set_step(2000)
-    policy.save_model(2000)
+    policy.checkpoint()
 
     assert len(os.listdir(tmp_path)) > 0