Renaming max_step to interrupted in TerminalStep(s) #3908

Merged
merged 2 commits on May 1, 2020

1 change: 1 addition & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to
### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
### Minor Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
3 changes: 3 additions & 0 deletions docs/Migrating.md
@@ -21,6 +21,7 @@ double-check that the versions are in the same. The versions can be found in
instead of `summaries/` and `models/`.
- Trainer configuration, curriculum configuration, and parameter randomization
configuration have all been moved to a single YAML file. (#3791)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.

### Steps to Migrate
- Before upgrading, copy your `Behavior Name` sections from `trainer_config.yaml` into
@@ -31,6 +32,8 @@ double-check that the versions are in the same. The versions can be found in
the `Behavior Name` section.
- If your training uses [parameter randomization](Training-ML-Agents.md#environment-parameter-randomization), move
the contents of the sampler config to `parameter_randomization` in the main trainer configuration.
- If you are using `UnityEnvironment` directly, replace `max_step` with `interrupted`
in the `TerminalStep` and `TerminalSteps` objects.

## Migrating from 0.15 to Release 1

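For reference, a minimal sketch of what the migration step above looks like in user code when driving `UnityEnvironment` directly (not part of this diff; the behavior name is hypothetical):

```python
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name=None)  # connect to a running Editor instance
env.reset()
behavior_name = "MyBehavior?team=0"  # hypothetical behavior name

decision_steps, terminal_steps = env.get_steps(behavior_name)
for agent_id in terminal_steps:
    # Before this change: terminal_steps[agent_id].max_step
    if terminal_steps[agent_id].interrupted:
        print(f"Agent {agent_id} was interrupted (e.g. hit its step limit)")
env.close()
```
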
11 changes: 6 additions & 5 deletions docs/Python-API.md
@@ -200,9 +200,9 @@ A `TerminalSteps` has the following fields :
- `agent_id` is an int vector of length batch size containing unique identifier
for the corresponding Agent. This is used to track Agents across simulation
steps.
- `max_step` is an array of booleans of length batch size. Is true if the
associated Agent reached its maximum number of steps during the last
simulation step.
- `interrupted` is an array of booleans of length batch size. Is true if the
associated Agent was interrupted since the last decision step. For example,
if the Agent reached the maximum number of steps for the episode.

It also has the two following methods:

@@ -218,8 +218,9 @@ A `TerminalStep` has the following fields:
- `reward` is a float. Corresponds to the rewards collected by the agent since
the last simulation step.
- `agent_id` is an int and an unique identifier for the corresponding Agent.
- `max_step` is a bool. Is true if the Agent reached its maximum number of steps
during the last simulation step.
- `interrupted` is a bool. Is true if the Agent was interrupted since the last
decision step. For example, if the Agent reached the maximum number of steps for
the episode.

#### BehaviorSpec

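As an illustration of the renamed field documented above (not taken from the diff), `interrupted` can be read either as a batch array or per agent; `terminal_steps` is assumed to come from `env.get_steps()`:

```python
import numpy as np

# Batch view: one boolean per agent whose episode ended this simulation step.
num_interrupted = int(np.sum(terminal_steps.interrupted))

# Per-agent view, via the Mapping interface of TerminalSteps.
for agent_id in terminal_steps:
    step = terminal_steps[agent_id]
    if step.interrupted:
        # Episode was cut short (e.g. max steps reached), not ended by the environment.
        pass
```
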
23 changes: 12 additions & 11 deletions ml-agents-envs/mlagents_envs/base_env.py
@@ -144,14 +144,15 @@ class TerminalStep(NamedTuple):
- obs is a list of numpy arrays observations collected by the agent.
- reward is a float. Corresponds to the rewards collected by the agent
since the last simulation step.
- max_step is a bool. Is true if the Agent reached its maximum number of
steps during the last simulation step.
- interrupted is a bool. Is true if the Agent was interrupted since the last
decision step. For example, if the Agent reached the maximum number of steps for
the episode.
- agent_id is an int and an unique identifier for the corresponding Agent.
"""

obs: List[np.ndarray]
reward: float
max_step: bool
interrupted: bool
agent_id: AgentId


@@ -165,18 +166,18 @@ class TerminalSteps(Mapping):
first dimension of the array corresponds to the batch size of the batch.
- reward is a float vector of length batch size. Corresponds to the
rewards collected by each agent since the last simulation step.
- max_step is an array of booleans of length batch size. Is true if the
associated Agent reached its maximum number of steps during the last
simulation step.
- interrupted is an array of booleans of length batch size. Is true if the
associated Agent was interrupted since the last decision step. For example, if the
Agent reached the maximum number of steps for the episode.
- agent_id is an int vector of length batch size containing unique
identifier for the corresponding Agent. This is used to track Agents
across simulation steps.
"""

def __init__(self, obs, reward, max_step, agent_id):
def __init__(self, obs, reward, interrupted, agent_id):
self.obs: List[np.ndarray] = obs
self.reward: np.ndarray = reward
self.max_step: np.ndarray = max_step
self.interrupted: np.ndarray = interrupted
self.agent_id: np.ndarray = agent_id
self._agent_id_to_index: Optional[Dict[AgentId, int]] = None

@@ -213,7 +214,7 @@ def __getitem__(self, agent_id: AgentId) -> TerminalStep:
return TerminalStep(
obs=agent_obs,
reward=self.reward[agent_index],
max_step=self.max_step[agent_index],
interrupted=self.interrupted[agent_index],
agent_id=agent_id,
)

@@ -232,7 +233,7 @@ def empty(spec: "BehaviorSpec") -> "TerminalSteps":
return TerminalSteps(
obs=obs,
reward=np.zeros(0, dtype=np.float32),
max_step=np.zeros(0, dtype=np.bool),
interrupted=np.zeros(0, dtype=np.bool),
agent_id=np.zeros(0, dtype=np.int32),
)

@@ -381,7 +382,7 @@ def get_steps(
the rewards, the agent ids and the action masks for the Agents
of the specified behavior. These Agents need an action this step.
- A TerminalSteps NamedTuple containing the observations,
rewards, agent ids and max_step flags of the agents that had their
rewards, agent ids and interrupted flags of the agents that had their
episode terminated last step.
"""
pass
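Purely as an illustration of the updated constructor and `__getitem__` above (shapes and values are made up, not taken from the PR):

```python
import numpy as np
from mlagents_envs.base_env import TerminalSteps

terminal_steps = TerminalSteps(
    obs=[np.zeros((2, 4), dtype=np.float32)],      # one observation type, batch of 2
    reward=np.array([1.0, 0.5], dtype=np.float32),
    interrupted=np.array([True, False]),           # renamed from max_step
    agent_id=np.array([0, 1], dtype=np.int32),
)

assert terminal_steps[0].interrupted         # agent 0 hit a step limit
assert not terminal_steps[1].interrupted     # agent 1 terminated naturally
```
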
4 changes: 2 additions & 2 deletions ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py
@@ -128,7 +128,7 @@ def proto_from_steps(
agent_id_index = terminal_steps.agent_id_to_index[agent_id]
reward = terminal_steps.reward[agent_id_index]
done = True
max_step_reached = terminal_steps.max_step[agent_id_index]
max_step_reached = terminal_steps.interrupted[agent_id_index]

final_observations: List[ObservationProto] = []
for all_observations_of_type in terminal_steps.obs:
@@ -248,7 +248,7 @@ def test_batched_step_result_from_proto():
for agent_id in range(n_agents):
assert (agent_id in terminal_steps) == (agent_id % 2 == 0)
if agent_id in terminal_steps:
assert terminal_steps[agent_id].max_step == (agent_id % 4 == 0)
assert terminal_steps[agent_id].interrupted == (agent_id % 4 == 0)
assert decision_steps.obs[0].shape[1] == shapes[0][0]
assert decision_steps.obs[1].shape[1] == shapes[1][0]
assert terminal_steps.obs[0].shape[1] == shapes[0][0]
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/agent_processor.py
@@ -125,7 +125,7 @@ def _process_step(
else:
memory = None
done = terminated # Since this is an ongoing step
max_step = step.max_step if terminated else False
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
if self.policy.use_continuous_act:
@@ -144,7 +144,7 @@
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step,
interrupted=interrupted,
memory=memory,
)
# Add the value outputs if needed
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/ppo/trainer.py
@@ -102,7 +102,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,
trajectory.done_reached and not trajectory.max_step_reached,
trajectory.done_reached and not trajectory.interrupted,
)
for name, v in value_estimates.items():
agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
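The PPO change above encodes when the value of the final observation should be bootstrapped; a small sketch of that rule (not the trainer's actual code):

```python
def should_bootstrap(done_reached: bool, interrupted: bool) -> bool:
    """Bootstrap from the next observation unless the episode truly ended."""
    return (not done_reached) or interrupted

assert should_bootstrap(done_reached=True, interrupted=True)       # cut off at max steps
assert not should_bootstrap(done_reached=True, interrupted=False)  # natural termination
```
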
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/sac/trainer.py
@@ -193,7 +193,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:

# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
if last_step.max_step:
if last_step.interrupted:
vec_vis_obs = SplitObservations.from_observations(last_step.obs)
for i, obs in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
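A hedged sketch of the bootstrapping idea behind the SAC change above: when the last step was an interruption, the final transition keeps bootstrapping rather than being treated as terminal. The buffer keys below are illustrative, not the trainer's exact field names:

```python
if last_step.interrupted:
    # Reuse the final observation as the "next" observation of the last transition
    # and clear its done flag, so the critic still bootstraps through the cut-off.
    buffer["next_vector_obs"][-1] = last_step.obs[-1]
    buffer["done"][-1] = False
```
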
8 changes: 4 additions & 4 deletions ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -75,7 +75,7 @@ def create_mock_steps(
]

reward = np.array(num_agents * [1.0], dtype=np.float32)
max_step = np.array(num_agents * [False], dtype=np.bool)
interrupted = np.array(num_agents * [False], dtype=np.bool)
agent_id = np.arange(num_agents, dtype=np.int32)
behavior_spec = BehaviorSpec(
[(84, 84, 3)] * num_vis_observations + [(num_vector_observations, 0, 0)],
@@ -85,7 +85,7 @@
if done:
return (
DecisionSteps.empty(behavior_spec),
TerminalSteps(obs_list, reward, max_step, agent_id),
TerminalSteps(obs_list, reward, interrupted, agent_id),
)
else:
return (
@@ -156,7 +156,7 @@ def make_fake_trajectory(
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step,
interrupted=max_step,
memory=memory,
)
steps_list.append(experience)
@@ -169,7 +169,7 @@
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step_complete,
interrupted=max_step_complete,
memory=memory,
)
steps_list.append(last_experience)
6 changes: 3 additions & 3 deletions ml-agents/mlagents/trainers/trajectory.py
@@ -13,7 +13,7 @@ class AgentExperience(NamedTuple):
action_pre: np.ndarray # TODO: Remove this
action_mask: np.ndarray
prev_action: np.ndarray
max_step: bool
interrupted: bool
memory: np.ndarray


@@ -141,8 +141,8 @@ def done_reached(self) -> bool:
return self.steps[-1].done

@property
def max_step_reached(self) -> bool:
def interrupted(self) -> bool:
"""
Returns true if trajectory was terminated because max steps was reached.
"""
return self.steps[-1].max_step
return self.steps[-1].interrupted