
Python Dataflow for Group Manager #4926


Merged: 212 commits, merged on Mar 4, 2021

Commits
7282b82
Make buffer type-agnostic
Nov 14, 2020
24e95f1
Edit types of Apped method
Nov 14, 2020
e73897e
Change comment
Nov 14, 2020
3ac6ff7
Collaborative walljump
Nov 17, 2020
59183b2
Make collab env harder
Nov 17, 2020
faf3308
Merge branch 'develop-stack-walljump' into develop-centralizedcritic
Nov 20, 2020
6b463a7
Add group ID
Nov 20, 2020
b7a8afc
Add collab obs to trajectory
Nov 20, 2020
84362c8
Merge branch 'develop-multitype-buffer' into develop-centralizedcritic
Nov 21, 2020
5fdf354
Fix bug; add critic_obs to buffer
Nov 21, 2020
d41ad79
Set group ids for some envs
Nov 21, 2020
26cc813
Merge branch 'develop-multitype-buffer' into develop-unified-obs
Nov 21, 2020
b71f2d7
Pretty broken
Nov 21, 2020
d94b700
Less broken PPO
Dec 1, 2020
5f523bf
Update SAC, fix PPO batching
Dec 1, 2020
72a165b
Fix SAC interrupted condition and typing
Dec 1, 2020
8287b97
Fix SAC interrupted again
Dec 1, 2020
f66e481
Remove erroneous file
Dec 1, 2020
e9f2ccc
Fix multiple obs
Dec 2, 2020
33e50cb
Update curiosity reward provider
Dec 2, 2020
d8db247
Update GAIL and BC
Dec 3, 2020
77ab2e9
Merge branch 'develop-unified-obs' into develop-centralizedcritic
Dec 3, 2020
fc3fbea
Multi-input network
Dec 3, 2020
45d2454
Some minor tweaks but still broken
Dec 3, 2020
f990a3a
Get next critic observations into value estimate
Dec 9, 2020
4f3132e
Temporarily disable exporting
Dec 10, 2020
be8b7ea
Use Vince's ONNX export code
Dec 11, 2020
ccff057
Cleanup
Dec 11, 2020
438ba4c
Add walljump collab YAML
Dec 14, 2020
4b8a396
Lower max height
Dec 14, 2020
6370c0c
Merge branch 'master' into develop-centralizedcritic
Dec 14, 2020
131599f
Update prefab
Dec 14, 2020
1a8ad76
Update prefab
Dec 14, 2020
e848147
Collaborative Hallway
Dec 14, 2020
788360e
Set num teammates to 2
Dec 14, 2020
c5536a6
Add config and group ids to HallwayCollab
Dec 14, 2020
32473bf
Fix bug with hallway collab
Dec 15, 2020
ae83e62
Edits to HallwayCollab
Dec 15, 2020
d72eee8
Update onnx file meta
Dec 15, 2020
62e9b45
Make the env easier
Dec 15, 2020
1ebacc1
Remove prints
Dec 15, 2020
cb57bf0
Make Collab env harder
Dec 17, 2020
95b3522
Fix group ID
Dec 18, 2020
afd7476
Add cc to ghost trainer
Dec 18, 2020
292b6ce
Add comment to ghost trainer
Dec 18, 2020
112a9dc
Revert "Add comment to ghost trainer"
Dec 18, 2020
783db4c
Actually add comment to ghosttrainer
Dec 18, 2020
6c4ba1e
Scale size of CC network
Dec 21, 2020
d314478
Scale value network based on num agents
Dec 21, 2020
c7adb93
Add 3rd symbol to hallway collab
Dec 21, 2020
d2e315d
Make comms one-hot
Dec 21, 2020
5cf76e3
Fix S tag
Dec 23, 2020
8708f70
Merge branch 'master' into develop-centralizedcritic-mm
Jan 4, 2021
44fb8b5
Additional changes
Jan 4, 2021
56f9dbf
Some more fixes
Jan 4, 2021
a468075
Self-attention Centralized Critic
Jan 6, 2021
db184d9
separate entity encoder and RSA
andrewcoh Jan 11, 2021
32cbdee
clean up args in mha
andrewcoh Jan 11, 2021
c90472c
more cleanups
andrewcoh Jan 11, 2021
d429b53
fixed tests
andrewcoh Jan 11, 2021
44093f2
Merge branch 'develop-attention-refactor' into develop-centralizedcri…
Jan 11, 2021
1dc0059
Merge branch 'develop-attention-refactor' into develop-centralizedcri…
Jan 11, 2021
2b5b994
entity embeddings work with no max
Jan 11, 2021
cd84fe3
remove group id
Jan 11, 2021
eed2fce
very rough sketch for TeamManager interface
Jan 8, 2021
fe41094
One layer for entity embed
Jan 12, 2021
3822b18
Use 4 heads
Jan 12, 2021
3f4b2b5
add defaults to linear encoder, initialize ent encoders
andrewcoh Jan 12, 2021
c7c7d4c
Merge branch 'master' into develop-centralizedcritic-mm
Jan 12, 2021
f391b35
Merge branch 'develop-lin-enc-def' into develop-centralizedcritic-mm
Jan 12, 2021
f706a91
add team manager id to proto
Jan 12, 2021
cee5466
team manager for hallway
Jan 12, 2021
195978c
add manager to hallway
Jan 12, 2021
10f336e
send and process team manager id
Jan 12, 2021
f0bf657
remove print
Jan 12, 2021
e03c79e
Merge branch 'develop-centralizedcritic-mm' into develop-cc-teammanager
Jan 12, 2021
1118089
small cleanup
Jan 13, 2021
13a90b1
default behavior for baseTeamManager
Jan 13, 2021
36d1b5b
add back statsrecorder
Jan 13, 2021
376d500
update
Jan 13, 2021
dd8b5fb
Team manager prototype (#4850)
Jan 13, 2021
8673820
Remove statsrecorder
Jan 13, 2021
fb86a57
Fix AgentProcessor for TeamManager
Jan 13, 2021
1beea7d
Merge branch 'develop-centralizedcritic-mm' into develop-cc-teammanager
Jan 13, 2021
9e69790
team manager
Jan 13, 2021
3c2b9d1
New buffer layout, TeamObsUtil, pad dead agents
Jan 14, 2021
b4b9d72
Use NaNs to get masks for attention
Jan 14, 2021
7d5f3e3
Add team reward to buffer
Jan 15, 2021
b7c5533
Try subtract marginalized value
Jan 15, 2021
53e1277
Add Q function with attention
Jan 20, 2021
2134004
Some more progress - still broken
Jan 20, 2021
60c6071
use singular entity embedding (#4873)
andrewcoh Jan 20, 2021
47cfae4
I think it's running
Jan 20, 2021
d31da21
Actions added but untested
Jan 21, 2021
541d062
Fix issue with team_actions
Jan 22, 2021
d3c4372
Add next action and next team obs
Jan 22, 2021
3407478
separate forward into q_net and baseline
andrewcoh Jan 22, 2021
f84ca50
Merge branch 'develop-centralizedcritic-counterfact' into develop-coma2
andrewcoh Jan 22, 2021
287c1b9
might be right
andrewcoh Jan 22, 2021
f73ef80
forcing this to work
andrewcoh Jan 22, 2021
10a416a
buffer error
andrewcoh Jan 22, 2021
e716199
COMAA runs
andrewcoh Jan 23, 2021
45349b8
add lambda return and target network
andrewcoh Jan 23, 2021
9a6474e
no target net
andrewcoh Jan 24, 2021
04d9617
remove normalize advantages
andrewcoh Jan 24, 2021
5bbb222
add target network back
andrewcoh Jan 24, 2021
2868694
value estimator
andrewcoh Jan 24, 2021
c9b4e71
update coma config
andrewcoh Jan 24, 2021
a10caaf
add target net
andrewcoh Jan 24, 2021
44c616d
no target, increase lambda
andrewcoh Jan 24, 2021
ef01af4
remove prints
andrewcoh Jan 24, 2021
f329e1d
cloud config
andrewcoh Jan 24, 2021
fbd1749
use v return
andrewcoh Jan 25, 2021
908b1df
use target net
andrewcoh Jan 25, 2021
d4073ce
adding zombie to coma2 brnch
andrewcoh Jan 25, 2021
7d8f2b5
add callbacks
andrewcoh Jan 25, 2021
9452239
cloud run with coma2 of held out zombie test env
andrewcoh Jan 25, 2021
39adec6
target of baseline is returns_v
andrewcoh Jan 26, 2021
14bb6fd
remove target update
andrewcoh Jan 26, 2021
7cb5dbc
Add team dones
Jan 26, 2021
761a206
ntegrate teammate dones
andrewcoh Jan 26, 2021
3afae60
add value clipping
andrewcoh Jan 26, 2021
f0dfada
try again on cloud
andrewcoh Jan 26, 2021
c3d8d8e
clipping values and updated zombie
andrewcoh Jan 27, 2021
c3d84c5
update configs
andrewcoh Jan 27, 2021
f5419aa
remove value head clipping
andrewcoh Jan 27, 2021
d7a2386
update zombie config
andrewcoh Jan 27, 2021
cdc6dde
Add trust region to COMA updates
Jan 29, 2021
4f35048
Remove Q-net for perf
Jan 29, 2021
05c8ea1
Weight decay, regularizaton loss
Jan 29, 2021
a7f2fc2
Use same network
Jan 29, 2021
6d2be2c
add base team manager
Feb 1, 2021
b812da4
Remove reg loss, still stable
Feb 4, 2021
0c3dbff
Black format
Feb 4, 2021
09590ad
add team reward field to agent and proto
Feb 5, 2021
c982c06
set team reward
Feb 5, 2021
7e3d976
add maxstep to teammanager and hook to academy
Feb 5, 2021
c40fec0
check agent by agent.enabled
Feb 8, 2021
ffb3f0b
remove manager from academy when dispose
Feb 9, 2021
f87cfbd
move manager
Feb 9, 2021
8b8e916
put team reward in decision steps
Feb 9, 2021
6b71f5a
use 0 as default manager id
Feb 9, 2021
87e97dd
fix setTeamReward
Feb 9, 2021
d3d1dc1
change method name to GetRegisteredAgents
Feb 9, 2021
2ba09ca
address comments
Feb 9, 2021
5587e48
Merge branch 'develop-base-teammanager' into develop-agentprocessor-t…
Feb 9, 2021
7e51ad1
Merge branch 'develop-base-teammanager' into develop-agentprocessor-t…
Feb 9, 2021
f25b171
Revert C# env changes
Feb 9, 2021
128b09b
Remove a bunch of stuff from envs
Feb 9, 2021
4690c4e
Remove a bunch of extra files
Feb 9, 2021
dbdd045
Remove changes from base-teammanager
Feb 9, 2021
30c846f
Remove remaining files
Feb 9, 2021
dd7f867
Remove some unneeded changes
Feb 9, 2021
f36f696
Make buffer typing neater
Feb 9, 2021
a1b7e75
AgentProcessor fixes
Feb 9, 2021
236f398
Back out trainer changes
Feb 9, 2021
a22c621
use delegate to avoid agent-manager cyclic reference
Feb 9, 2021
2dc90a9
put team reward in decision steps
Feb 9, 2021
70207a3
fix unregister agents
Feb 10, 2021
49282f6
add teamreward to decision step
Feb 10, 2021
204b45b
typo
Feb 10, 2021
7eacfba
unregister on disabled
Feb 10, 2021
016ffd8
remove OnTeamEpisodeBegin
Feb 10, 2021
8b9d662
change name TeamManager to MultiAgentGroup
Feb 11, 2021
3fb14b9
more team -> group
Feb 11, 2021
4e4ecad
fix tests
Feb 11, 2021
492fd17
fix tests
Feb 11, 2021
7292672
Merge remote-tracking branch 'origin/develop-base-teammanager' into d…
Feb 11, 2021
78e052b
Use attention tests from master
Feb 11, 2021
81d8389
Revert "Use attention tests from master"
Feb 11, 2021
39f92c3
Use attention from master
Feb 11, 2021
1d500d6
Renaming fest
Feb 11, 2021
6418e05
Use NamedTuples instead of attrs classes
Feb 11, 2021
6da8dd3
Bug fixes
Feb 11, 2021
ad4a821
remove GroupMaxStep
Feb 12, 2021
9725aa5
add some doc
Feb 12, 2021
f5190fe
Fix mock brain
Feb 12, 2021
664ae89
np float32 fixes
Feb 12, 2021
8f696f4
more renaming
Feb 12, 2021
77557ca
Test for team obs in agentprocessor
Feb 12, 2021
6464cb6
Test for group and add team reward
Feb 12, 2021
cbfdfb3
doc improve
Feb 12, 2021
6badfb5
Merge branch 'master' into develop-base-teammanager
Feb 13, 2021
ef67f53
Merge branch 'master' into develop-base-teammanager
Feb 13, 2021
8e78dbd
Merge branch 'develop-base-teammanager' of https://github.com/Unity-T…
Feb 13, 2021
31ee1c4
store registered agents in set
Feb 16, 2021
1e4c837
remove unused step counts
Feb 17, 2021
cba26b2
Merge branch 'develop-base-teammanager' into develop-agentprocessor-t…
Feb 17, 2021
2113a43
Global group ids
Feb 17, 2021
0e28c07
Fix Trajectory test
Feb 19, 2021
6936004
Merge branch 'master' into develop-agentprocessor-teammanager
Feb 23, 2021
97d1b80
Remove duplicated files
Feb 23, 2021
9a00053
Add team methods to AgentAction
Feb 23, 2021
f879b61
Buffer fixes
Feb 23, 2021
6d7a604
Add test for GroupObs
Feb 24, 2021
587e3da
Change AgentAction back to 0 pad and add tests
Feb 24, 2021
fd4aa53
Addressed some comments
Feb 24, 2021
8dbea77
Address some comments
Feb 25, 2021
ec9e5ad
Add more comments
Feb 25, 2021
e1f48db
Rename internal function
Feb 25, 2021
d42896a
Move padding method to AgentBufferField
Feb 25, 2021
b3f2689
Merge branch 'main' into develop-agentprocessor-teammanager
Feb 25, 2021
b2100c1
Fix slicing typing and string printing in AgentBufferField
Feb 25, 2021
7b1f805
Fix to-flat and add tests
Feb 25, 2021
1ce50ef
Rename GroupmateStatus to AgentStatus
Mar 1, 2021
aea066d
Update comments
Mar 4, 2021
0860098
Added GroupId, GlobalGroupId, GlobalAgentId types
Mar 4, 2021
580683c
Update comment
Mar 4, 2021
5e73905
Make some agent processor properties internal
Mar 4, 2021
f1c8d45
Rename add_group_status
Mar 4, 2021
af0d353
Rename store_group_status, fix test
Mar 4, 2021
f3ef9ef
Rename clear_group_obs
Mar 4, 2021
Files changed
3 changes: 2 additions & 1 deletion ml-agents-envs/mlagents_envs/base_env.py
@@ -34,6 +34,7 @@
from mlagents_envs.exception import UnityActionException

AgentId = int
GroupId = int
BehaviorName = str


@@ -172,7 +173,7 @@ class TerminalStep(NamedTuple):
reward: float
interrupted: bool
agent_id: AgentId
group_id: int
group_id: GroupId
group_reward: float


211 changes: 159 additions & 52 deletions ml-agents/mlagents/trainers/agent_processor.py

Large diffs are not rendered by default.

15 changes: 13 additions & 2 deletions ml-agents/mlagents/trainers/behavior_id_utils.py
@@ -1,5 +1,9 @@
from typing import NamedTuple
from urllib.parse import urlparse, parse_qs
from mlagents_envs.base_env import AgentId, GroupId

GlobalGroupId = str
GlobalAgentId = str


class BehaviorIdentifiers(NamedTuple):
@@ -46,8 +50,15 @@ def create_name_behavior_id(name: str, team_id: int) -> str:
return name + "?team=" + str(team_id)


def get_global_agent_id(worker_id: int, agent_id: int) -> str:
def get_global_agent_id(worker_id: int, agent_id: AgentId) -> GlobalAgentId:
"""
Create an agent id that is unique across environment workers using the worker_id.
"""
return f"${worker_id}-{agent_id}"
return f"agent_{worker_id}-{agent_id}"


def get_global_group_id(worker_id: int, group_id: GroupId) -> GlobalGroupId:
"""
Create a group id that is unique across environment workers when using the worker_id.
"""
return f"group_{worker_id}-{group_id}"
106 changes: 77 additions & 29 deletions ml-agents/mlagents/trainers/buffer.py
@@ -9,6 +9,10 @@

from mlagents_envs.exception import UnityException

# Elements in the buffer can be np.ndarray, or in the case of teammate obs, actions, rewards,
# a List of np.ndarray. This is done so that we don't have duplicated np.ndarrays, only references.
BufferEntry = Union[np.ndarray, List[np.ndarray]]


class BufferException(UnityException):
"""
@@ -21,8 +25,10 @@ class BufferException(UnityException):
class BufferKey(enum.Enum):
ACTION_MASK = "action_mask"
CONTINUOUS_ACTION = "continuous_action"
NEXT_CONT_ACTION = "next_continuous_action"
CONTINUOUS_LOG_PROBS = "continuous_log_probs"
DISCRETE_ACTION = "discrete_action"
NEXT_DISC_ACTION = "next_discrete_action"
DISCRETE_LOG_PROBS = "discrete_log_probs"
DONE = "done"
ENVIRONMENT_REWARDS = "environment_rewards"
@@ -34,11 +40,22 @@ class BufferKey(enum.Enum):
ADVANTAGES = "advantages"
DISCOUNTED_RETURNS = "discounted_returns"

GROUP_DONES = "group_dones"
GROUPMATE_REWARDS = "groupmate_reward"
GROUP_REWARD = "group_reward"
GROUP_CONTINUOUS_ACTION = "group_continuous_action"
GROUP_DISCRETE_ACTION = "group_discrete_aaction"
GROUP_NEXT_CONT_ACTION = "group_next_cont_action"
GROUP_NEXT_DISC_ACTION = "group_next_disc_action"


class ObservationKeyPrefix(enum.Enum):
OBSERVATION = "obs"
NEXT_OBSERVATION = "next_obs"

GROUP_OBSERVATION = "group_obs"
NEXT_GROUP_OBSERVATION = "next_group_obs"


class RewardSignalKeyPrefix(enum.Enum):
# Reward signals
@@ -73,16 +90,23 @@ def advantage_key(name: str) -> AgentBufferKey:

class AgentBufferField(list):
"""
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
AgentBufferField is a list of numpy arrays, or List[np.ndarray] for group entries.
When an agent collects a field, you can add it to its AgentBufferField with the append method.
"""

def __init__(self):
def __init__(self, *args, **kwargs):
self.padding_value = 0
super().__init__()
super().__init__(*args, **kwargs)

def __str__(self):
return str(np.array(self).shape)
def __str__(self) -> str:
return f"AgentBufferField: {super().__str__()}"

def __getitem__(self, index):
return_data = super().__getitem__(index)
if isinstance(return_data, list):
return AgentBufferField(return_data)
else:
return return_data

def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
"""
@@ -95,31 +119,20 @@ def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
super().append(element)
self.padding_value = padding_value

def extend(self, data: np.ndarray) -> None:
"""
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
self += list(np.array(data, dtype=np.float32))

def set(self, data):
def set(self, data: List[BufferEntry]) -> None:
"""
Sets the list of np.array to the input data
:param data: The np.array list to be set.
Sets the list of BufferEntry to the input data
:param data: The BufferEntry list to be set.
"""
# Make sure we convert incoming data to float32 if it's a float
dtype = None
if data is not None and len(data) and isinstance(data[0], float):
dtype = np.float32
self[:] = []
self[:] = list(np.array(data, dtype=dtype))
self[:] = data

def get_batch(
self,
batch_size: int = None,
training_length: Optional[int] = 1,
sequential: bool = True,
) -> np.ndarray:
) -> List[BufferEntry]:
"""
Retrieve the last batch_size elements of length training_length
from the list of np.array
@@ -150,13 +163,10 @@ def get_batch(
)
if batch_size * training_length > len(self):
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:], dtype=np.float32
)
return [padding] * (training_length - leftover) + self[:]

else:
return np.array(
self[len(self) - batch_size * training_length :], dtype=np.float32
)
return self[len(self) - batch_size * training_length :]
else:
# The sequences will have overlapping elements
if batch_size is None:
@@ -172,14 +182,52 @@
tmp_list: List[np.ndarray] = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
return tmp_list

def reset_field(self) -> None:
"""
Resets the AgentBufferField
"""
self[:] = []

def padded_to_batch(
self, pad_value: np.float = 0, dtype: np.dtype = np.float32
) -> Union[np.ndarray, List[np.ndarray]]:
"""
Converts this AgentBufferField (which is a List[BufferEntry]) into a numpy array
with first dimension equal to the length of this AgentBufferField. If this AgentBufferField
contains a List[List[BufferEntry]] (i.e., in the case of group observations), return a List
containing numpy arrays or tensors, of length equal to the maximum length of an entry.
For entries with less than that length, the array will be padded with pad_value.
:param pad_value: Value to pad List AgentBufferFields, when there are less than the maximum
number of agents present.
:param dtype: Dtype of output numpy array.
:return: Numpy array or List of numpy arrays representing this AgentBufferField, where the first
dimension is equal to the length of the AgentBufferField.
"""
if len(self) > 0 and not isinstance(self[0], list):
return np.asanyarray(self, dtype=dtype)

shape = None
for _entry in self:
# _entry could be an empty list if there are no group agents in this
# step. Find the first non-empty list and use that shape.
if _entry:
shape = _entry[0].shape
break
# If there were no groupmate agents in the entire batch, return an empty List.
if shape is None:
return []

# Convert to numpy array while padding with 0's
new_list = list(
map(
lambda x: np.asanyarray(x, dtype=dtype),
itertools.zip_longest(*self, fillvalue=np.full(shape, pad_value)),
)
)
return new_list


class AgentBuffer(MutableMapping):
"""
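
To make the padding behaviour of padded_to_batch concrete, here is a small self-contained sketch of the same itertools.zip_longest logic with made-up group observations (two groupmates at step 0, one at step 1, none at step 2):

    import itertools
    from typing import List

    import numpy as np

    # Each buffer entry is the per-step list of one value per groupmate.
    field: List[List[np.ndarray]] = [
        [np.array([1.0, 1.0]), np.array([2.0, 2.0])],
        [np.array([3.0, 3.0])],
        [],
    ]

    shape = next(entry[0].shape for entry in field if entry)  # shape from first non-empty entry
    padded = [
        np.asanyarray(column, dtype=np.float32)
        for column in itertools.zip_longest(*field, fillvalue=np.full(shape, 0.0))
    ]
    # padded[0] stacks the first groupmate across all steps, padded[1] the second;
    # missing groupmates are filled with zero arrays, so both have shape (3, 2).

This is why the method returns a List of arrays whose length equals the maximum number of groupmates seen in the batch, rather than a single ragged ndarray.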
15 changes: 8 additions & 7 deletions ml-agents/mlagents/trainers/policy/policy.py
@@ -8,6 +8,7 @@
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.behavior_id_utils import GlobalAgentId


class UnityPolicyException(UnityException):
@@ -68,7 +69,7 @@ def make_empty_memory(self, num_agents):
return np.zeros((num_agents, self.m_size), dtype=np.float32)

def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
self, agent_ids: List[GlobalAgentId], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
@@ -81,21 +82,21 @@ def save_memories(
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]

def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
def retrieve_memories(self, agent_ids: List[GlobalAgentId]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix

def retrieve_previous_memories(self, agent_ids: List[str]) -> np.ndarray:
def retrieve_previous_memories(self, agent_ids: List[GlobalAgentId]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_memory_dict:
memory_matrix[index, :] = self.previous_memory_dict[agent_id]
return memory_matrix

def remove_memories(self, agent_ids):
def remove_memories(self, agent_ids: List[GlobalAgentId]) -> None:
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
@@ -113,19 +114,19 @@ def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
)

def save_previous_action(
self, agent_ids: List[str], action_tuple: ActionTuple
self, agent_ids: List[GlobalAgentId], action_tuple: ActionTuple
) -> None:
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_tuple.discrete[index, :]

def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
def retrieve_previous_action(self, agent_ids: List[GlobalAgentId]) -> np.ndarray:
action_matrix = self.make_empty_previous_action(len(agent_ids))
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix

def remove_previous_action(self, agent_ids):
def remove_previous_action(self, agent_ids: List[GlobalAgentId]) -> None:
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/ppo/trainer.py
@@ -180,7 +180,7 @@ def _update_policy(self):
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)

advantages = self.update_buffer[BufferKey.ADVANTAGES].get_batch()
advantages = np.array(self.update_buffer[BufferKey.ADVANTAGES].get_batch())
self.update_buffer[BufferKey.ADVANTAGES].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
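
A minimal numpy sketch of the normalization step above, reflecting that get_batch() now returns a plain Python list rather than an np.ndarray (the advantage values are made up):

    import numpy as np

    raw_advantages = [1.0, -0.5, 2.0, 0.25]    # as returned by get_batch()
    advantages = np.array(raw_advantages)       # hence the explicit np.array(...) in the trainer
    normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-10)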
20 changes: 17 additions & 3 deletions ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -3,7 +3,7 @@

from mlagents.trainers.buffer import AgentBuffer, AgentBufferKey
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.trajectory import AgentStatus, Trajectory, AgentExperience
from mlagents_envs.base_env import (
DecisionSteps,
TerminalSteps,
@@ -20,6 +20,7 @@ def create_mock_steps(
observation_specs: List[ObservationSpec],
action_spec: ActionSpec,
done: bool = False,
grouped: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
"""
Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
@@ -43,7 +44,8 @@
reward = np.array(num_agents * [1.0], dtype=np.float32)
interrupted = np.array(num_agents * [False], dtype=np.bool)
agent_id = np.arange(num_agents, dtype=np.int32)
group_id = np.array(num_agents * [0], dtype=np.int32)
_gid = 1 if grouped else 0
group_id = np.array(num_agents * [_gid], dtype=np.int32)
group_reward = np.array(num_agents * [0.0], dtype=np.float32)
behavior_spec = BehaviorSpec(observation_specs, action_spec)
if done:
@@ -78,6 +80,7 @@ def make_fake_trajectory(
action_spec: ActionSpec,
max_step_complete: bool = False,
memory_size: int = 10,
num_other_agents_in_group: int = 0,
) -> Trajectory:
"""
Makes a fake trajectory of length length. If max_step_complete,
@@ -117,6 +120,9 @@
memory = np.ones(memory_size, dtype=np.float32)
agent_id = "test_agent"
behavior_id = "test_brain"
group_status = []
for _ in range(num_other_agents_in_group):
group_status.append(AgentStatus(obs, reward, action, done))
experience = AgentExperience(
obs=obs,
reward=reward,
Expand All @@ -127,6 +133,8 @@ def make_fake_trajectory(
prev_action=prev_action,
interrupted=max_step,
memory=memory,
group_status=group_status,
group_reward=0,
)
steps_list.append(experience)
obs = []
@@ -142,10 +150,16 @@
prev_action=prev_action,
interrupted=max_step_complete,
memory=memory,
group_status=group_status,
group_reward=0,
)
steps_list.append(last_experience)
return Trajectory(
steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
steps=steps_list,
agent_id=agent_id,
behavior_id=behavior_id,
next_obs=obs,
next_group_obs=[obs] * num_other_agents_in_group,
)


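
As a rough illustration of the group_status list assembled above, here is a local stand-in that mirrors the positional fields used in the AgentStatus(...) call (obs, reward, action, done); it is a hypothetical sketch, not the library class:

    from typing import List, NamedTuple

    import numpy as np

    class AgentStatusSketch(NamedTuple):
        obs: List[np.ndarray]
        reward: float
        action: np.ndarray
        done: bool

    obs = [np.zeros(4, dtype=np.float32)]
    # One entry per groupmate, e.g. with num_other_agents_in_group=2.
    group_status = [
        AgentStatusSketch(obs=obs, reward=1.0, action=np.zeros(2, dtype=np.float32), done=False)
        for _ in range(2)
    ]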