From b64e5b84f83c3665924d33ee2ac31f488f66a9cb Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 30 Mar 2021 18:45:31 -0400 Subject: [PATCH 01/13] Pad buffer at the end --- ml-agents/mlagents/trainers/buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/buffer.py b/ml-agents/mlagents/trainers/buffer.py index 0fd5cdaa3e..3c8552d11d 100644 --- a/ml-agents/mlagents/trainers/buffer.py +++ b/ml-agents/mlagents/trainers/buffer.py @@ -180,7 +180,7 @@ def get_batch( else: # We want to duplicate the last value in the array, multiplied by the padding_value. padding = np.array(self[-1], dtype=np.float32) * self.padding_value - return [padding] * (training_length - leftover) + self[:] + return self[:] + [padding] * (training_length - leftover) else: return self[len(self) - batch_size * training_length :] From 29af3433f8a90cad748a3ebf5da52692978f8c14 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 31 Mar 2021 10:54:26 -0400 Subject: [PATCH 02/13] Fix padding in optimizer value estimate --- .../trainers/optimizer/torch_optimizer.py | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py index 8609482b4c..aeac1ddcb4 100644 --- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py @@ -2,6 +2,7 @@ from mlagents.torch_utils import torch import numpy as np import math +from collections import defaultdict from mlagents.trainers.buffer import AgentBuffer, AgentBufferField from mlagents.trainers.trajectory import ObsUtil @@ -81,41 +82,19 @@ def _evaluate_by_sequence( # Compute the number of elements in this padded seq. leftover = num_experiences % self.policy.sequence_length - # Compute values for the potentially truncated initial sequence - seq_obs = [] - - first_seq_len = leftover if leftover > 0 else self.policy.sequence_length - for _obs in tensor_obs: - first_seq_obs = _obs[0:first_seq_len] - seq_obs.append(first_seq_obs) - - # For the first sequence, the initial memory should be the one at the - # beginning of this trajectory. 
- for _ in range(first_seq_len): - all_next_memories.append(ModelUtils.to_numpy(initial_memory.squeeze())) - - init_values, _mem = self.critic.critic_pass( - seq_obs, initial_memory, sequence_length=first_seq_len - ) - all_values = { - signal_name: [init_values[signal_name]] - for signal_name in init_values.keys() - } - + all_values: Dict[str, List[np.ndarray]] = defaultdict(list) + _mem = initial_memory # Evaluate other trajectories, carrying over _mem after each # trajectory for seq_num in range( - 1, math.ceil((num_experiences) / (self.policy.sequence_length)) + 0, math.floor((num_experiences) / (self.policy.sequence_length)) ): seq_obs = [] for _ in range(self.policy.sequence_length): all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) - start = seq_num * self.policy.sequence_length - ( - self.policy.sequence_length - leftover - ) - end = (seq_num + 1) * self.policy.sequence_length - ( - self.policy.sequence_length - leftover - ) + start = seq_num * self.policy.sequence_length + end = (seq_num + 1) * self.policy.sequence_length + for _obs in tensor_obs: seq_obs.append(_obs[start:end]) values, _mem = self.critic.critic_pass( @@ -123,6 +102,27 @@ def _evaluate_by_sequence( ) for signal_name, _val in values.items(): all_values[signal_name].append(_val) + + # Compute values for the potentially truncated last sequence + seq_obs = [] + + last_seq_len = leftover + if last_seq_len > 0: + for _obs in tensor_obs: + last_seq_obs = _obs[0:last_seq_len] + seq_obs.append(last_seq_obs) + + # For the first sequence, the initial memory should be the one at the + # beginning of this trajectory. + for _ in range(last_seq_len): + all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) + + last_values, _mem = self.critic.critic_pass( + seq_obs, _mem, sequence_length=last_seq_len + ) + for signal_name, _val in last_values.items(): + all_values[signal_name].append(_val) + # Create one tensor per reward signal all_value_tensors = { signal_name: torch.cat(value_list, dim=0) From 39f6d57be376b9e1f1275b95cde055a6e714b75b Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 31 Mar 2021 13:56:33 -0400 Subject: [PATCH 03/13] Fix additional bugs and POCA --- .../trainers/optimizer/torch_optimizer.py | 10 +- .../mlagents/trainers/poca/optimizer_torch.py | 131 +++++++++--------- 2 files changed, 68 insertions(+), 73 deletions(-) diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py index aeac1ddcb4..ba5ca7b267 100644 --- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py @@ -77,8 +77,8 @@ def _evaluate_by_sequence( """ num_experiences = tensor_obs[0].shape[0] all_next_memories = AgentBufferField() - # In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and - # trajectory is of length 10, the 1st sequence is [pad,pad,obs]. + # In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and + # trajectory is of length 10, the last sequence is [obs,pad,pad]. # Compute the number of elements in this padded seq. leftover = num_experiences % self.policy.sequence_length @@ -109,11 +109,11 @@ def _evaluate_by_sequence( last_seq_len = leftover if last_seq_len > 0: for _obs in tensor_obs: - last_seq_obs = _obs[0:last_seq_len] + last_seq_obs = _obs[-last_seq_len:] seq_obs.append(last_seq_obs) - # For the first sequence, the initial memory should be the one at the - # beginning of this trajectory. 
+ # For the last sequence, the initial memory should be the one at the + # end of this trajectory. for _ in range(last_seq_len): all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) diff --git a/ml-agents/mlagents/trainers/poca/optimizer_torch.py b/ml-agents/mlagents/trainers/poca/optimizer_torch.py index 5282d190f7..73fb5bc6fb 100644 --- a/ml-agents/mlagents/trainers/poca/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/poca/optimizer_torch.py @@ -1,4 +1,5 @@ from typing import Dict, cast, List, Tuple, Optional +from collections import defaultdict from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import ( ExtrinsicRewardProvider, ) @@ -381,66 +382,20 @@ def _evaluate_by_sequence_team( num_experiences = self_obs[0].shape[0] all_next_value_mem = AgentBufferField() all_next_baseline_mem = AgentBufferField() - # In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and - # trajectory is of length 10, the 1st sequence is [pad,pad,obs]. + # In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and + # trajectory is of length 10, the last sequence is [obs,pad,pad]. # Compute the number of elements in this padded seq. leftover = num_experiences % self.policy.sequence_length - # Compute values for the potentially truncated initial sequence - - first_seq_len = leftover if leftover > 0 else self.policy.sequence_length - - self_seq_obs = [] - groupmate_seq_obs = [] - groupmate_seq_act = [] - seq_obs = [] - for _self_obs in self_obs: - first_seq_obs = _self_obs[0:first_seq_len] - seq_obs.append(first_seq_obs) - self_seq_obs.append(seq_obs) - - for groupmate_obs, groupmate_action in zip(obs, actions): - seq_obs = [] - for _obs in groupmate_obs: - first_seq_obs = _obs[0:first_seq_len] - seq_obs.append(first_seq_obs) - groupmate_seq_obs.append(seq_obs) - _act = groupmate_action.slice(0, first_seq_len) - groupmate_seq_act.append(_act) - - # For the first sequence, the initial memory should be the one at the - # beginning of this trajectory. 
- for _ in range(first_seq_len): - all_next_value_mem.append(ModelUtils.to_numpy(init_value_mem.squeeze())) - all_next_baseline_mem.append( - ModelUtils.to_numpy(init_baseline_mem.squeeze()) - ) - - all_seq_obs = self_seq_obs + groupmate_seq_obs - init_values, _value_mem = self.critic.critic_pass( - all_seq_obs, init_value_mem, sequence_length=first_seq_len - ) - all_values = { - signal_name: [init_values[signal_name]] - for signal_name in init_values.keys() - } - - groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act) - init_baseline, _baseline_mem = self.critic.baseline( - self_seq_obs[0], - groupmate_obs_and_actions, - init_baseline_mem, - sequence_length=first_seq_len, - ) - all_baseline = { - signal_name: [init_baseline[signal_name]] - for signal_name in init_baseline.keys() - } + all_values: Dict[str, List[np.ndarray]] = defaultdict(list) + all_baseline: Dict[str, List[np.ndarray]] = defaultdict(list) + _baseline_mem = init_baseline_mem + _value_mem = init_value_mem # Evaluate other trajectories, carrying over _mem after each # trajectory for seq_num in range( - 1, math.ceil((num_experiences) / (self.policy.sequence_length)) + 0, math.floor((num_experiences) / (self.policy.sequence_length)) ): for _ in range(self.policy.sequence_length): all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze())) @@ -448,19 +403,15 @@ def _evaluate_by_sequence_team( ModelUtils.to_numpy(_baseline_mem.squeeze()) ) - start = seq_num * self.policy.sequence_length - ( - self.policy.sequence_length - leftover - ) - end = (seq_num + 1) * self.policy.sequence_length - ( - self.policy.sequence_length - leftover - ) + start = seq_num * self.policy.sequence_length + end = (seq_num + 1) * self.policy.sequence_length self_seq_obs = [] groupmate_seq_obs = [] groupmate_seq_act = [] seq_obs = [] for _self_obs in self_obs: - seq_obs.append(_obs[start:end]) + seq_obs.append(_self_obs[start:end]) self_seq_obs.append(seq_obs) for groupmate_obs, team_action in zip(obs, actions): @@ -476,21 +427,65 @@ def _evaluate_by_sequence_team( values, _value_mem = self.critic.critic_pass( all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length ) - all_values = { - signal_name: [init_values[signal_name]] for signal_name in values.keys() - } + for signal_name, _val in values.items(): + all_values[signal_name].append(_val) groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act) baselines, _baseline_mem = self.critic.baseline( self_seq_obs[0], groupmate_obs_and_actions, _baseline_mem, - sequence_length=first_seq_len, + sequence_length=self.policy.sequence_length, + ) + for signal_name, _val in baselines.items(): + all_baseline[signal_name].append(_val) + + # Compute values for the potentially truncated initial sequence + last_seq_len = leftover + if last_seq_len > 0: + self_seq_obs = [] + groupmate_seq_obs = [] + groupmate_seq_act = [] + seq_obs = [] + for _self_obs in self_obs: + last_seq_obs = _self_obs[-last_seq_len:] + seq_obs.append(last_seq_obs) + self_seq_obs.append(seq_obs) + + for groupmate_obs, groupmate_action in zip(obs, actions): + seq_obs = [] + for _obs in groupmate_obs: + last_seq_obs = _obs[-last_seq_len:] + seq_obs.append(last_seq_obs) + groupmate_seq_obs.append(seq_obs) + _act = groupmate_action.slice(len(_obs) - last_seq_len, len(_obs)) + groupmate_seq_act.append(_act) + + # For the last sequence, the initial memory should be the one at the + # beginning of this trajectory. 
+ seq_obs = [] + last_seq_len = leftover + for _ in range(last_seq_len): + all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze())) + all_next_baseline_mem.append( + ModelUtils.to_numpy(_baseline_mem.squeeze()) + ) + + all_seq_obs = self_seq_obs + groupmate_seq_obs + last_values, _value_mem = self.critic.critic_pass( + all_seq_obs, _value_mem, sequence_length=last_seq_len + ) + for signal_name, _val in last_values.items(): + all_values[signal_name].append(_val) + groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act) + last_baseline, _baseline_mem = self.critic.baseline( + self_seq_obs[0], + groupmate_obs_and_actions, + _baseline_mem, + sequence_length=last_seq_len, ) - all_baseline = { - signal_name: [baselines[signal_name]] - for signal_name in baselines.keys() - } + for signal_name, _val in last_baseline.items(): + all_baseline[signal_name].append(_val) # Create one tensor per reward signal all_value_tensors = { signal_name: torch.cat(value_list, dim=0) From 48dafb6eb176b8460de11e7fd88e273326a93746 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 31 Mar 2021 14:11:40 -0400 Subject: [PATCH 04/13] Fix groupmate obs, add tests --- ml-agents/mlagents/trainers/poca/optimizer_torch.py | 10 +++++----- ml-agents/mlagents/trainers/tests/torch/test_poca.py | 2 +- ml-agents/mlagents/trainers/tests/torch/test_ppo.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/ml-agents/mlagents/trainers/poca/optimizer_torch.py b/ml-agents/mlagents/trainers/poca/optimizer_torch.py index 73fb5bc6fb..108520f38b 100644 --- a/ml-agents/mlagents/trainers/poca/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/poca/optimizer_torch.py @@ -414,13 +414,13 @@ def _evaluate_by_sequence_team( seq_obs.append(_self_obs[start:end]) self_seq_obs.append(seq_obs) - for groupmate_obs, team_action in zip(obs, actions): + for groupmate_obs, groupmate_action in zip(obs, actions): seq_obs = [] - for (_obs,) in groupmate_obs: - first_seq_obs = _obs[start:end] - seq_obs.append(first_seq_obs) + for _obs in groupmate_obs: + sliced_seq_obs = _obs[start:end] + seq_obs.append(sliced_seq_obs) groupmate_seq_obs.append(seq_obs) - _act = team_action.slice(start, end) + _act = groupmate_action.slice(start, end) groupmate_seq_act.append(_act) all_seq_obs = self_seq_obs + groupmate_seq_obs diff --git a/ml-agents/mlagents/trainers/tests/torch/test_poca.py b/ml-agents/mlagents/trainers/tests/torch/test_poca.py index 2552234bb0..ed22516830 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_poca.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_poca.py @@ -60,7 +60,7 @@ def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete, use_visual): } trainer_settings.network_settings.memory = ( - NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) + NetworkSettings.MemorySettings(sequence_length=8, memory_size=10) if use_rnn else None ) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ppo.py b/ml-agents/mlagents/trainers/tests/torch/test_ppo.py index ca1a18b1e7..cca4442498 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_ppo.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_ppo.py @@ -46,7 +46,7 @@ def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual): trainer_settings = attr.evolve(dummy_config) trainer_settings.network_settings.memory = ( - NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) + NetworkSettings.MemorySettings(sequence_length=8, memory_size=10) if use_rnn else None ) @@ -197,6 +197,8 @@ def 
test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete): optimizer = create_test_ppo_optimizer( dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual ) + # Time horizon is longer than sequence length, make sure to test + # process trajectory on multiple sequences in trajectory + some padding time_horizon = 15 trajectory = make_fake_trajectory( length=time_horizon, From 0a5b798a8a9e03de199e8d9d411c853941dd6c89 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 31 Mar 2021 14:25:00 -0400 Subject: [PATCH 05/13] Update changelog --- com.unity.ml-agents/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index 3a8cdaac99..8018c2cba7 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -45,7 +45,7 @@ depend on the previous behavior, you can explicitly set the Agent's `InferenceDe ### Bug Fixes #### com.unity.ml-agents / com.unity.ml-agents.extensions (C#) #### ml-agents / ml-agents-envs / gym-unity (Python) - +- Fixed an issue with LSTM when used with POCA and `sequence_length` < `time_horizon`. Also improved behavior of LSTMs slightly. (#5206) ## [1.9.0-preview] - 2021-03-17 ### Major Changes From be54401440334730dcac8623999beea0cd7bc132 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 31 Mar 2021 17:40:42 -0400 Subject: [PATCH 06/13] Improve tests --- ml-agents/mlagents/trainers/tests/torch/test_poca.py | 2 +- ml-agents/mlagents/trainers/tests/torch/test_ppo.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_poca.py b/ml-agents/mlagents/trainers/tests/torch/test_poca.py index ed22516830..d48ae6107c 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_poca.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_poca.py @@ -125,7 +125,7 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete): optimizer = create_test_poca_optimizer( dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual ) - time_horizon = 15 + time_horizon = 30 trajectory = make_fake_trajectory( length=time_horizon, observation_specs=optimizer.policy.behavior_spec.observation_specs, diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ppo.py b/ml-agents/mlagents/trainers/tests/torch/test_ppo.py index cca4442498..743bd824b7 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_ppo.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_ppo.py @@ -46,7 +46,7 @@ def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual): trainer_settings = attr.evolve(dummy_config) trainer_settings.network_settings.memory = ( - NetworkSettings.MemorySettings(sequence_length=8, memory_size=10) + NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) if use_rnn else None ) @@ -199,7 +199,7 @@ def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete): ) # Time horizon is longer than sequence length, make sure to test # process trajectory on multiple sequences in trajectory + some padding - time_horizon = 15 + time_horizon = 30 trajectory = make_fake_trajectory( length=time_horizon, observation_specs=optimizer.policy.behavior_spec.observation_specs, @@ -216,9 +216,9 @@ def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete): for key, val in run_out.items(): assert type(key) is str - assert len(val) == 15 + assert len(val) == time_horizon if all_memories is not None: - assert len(all_memories) == 15 + assert len(all_memories) == 
time_horizon run_out, final_value_out, _ = optimizer.get_trajectory_value_estimates( trajectory.to_agentbuffer(), trajectory.next_obs, done=True From 439b1bd3288bcd15b1590f64d00e27ee7080c11f Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 31 Mar 2021 19:01:11 -0400 Subject: [PATCH 07/13] Address comments --- .../trainers/optimizer/torch_optimizer.py | 27 +++++++++---------- .../mlagents/trainers/poca/optimizer_torch.py | 26 +++++++++--------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py index ba5ca7b267..04c3968081 100644 --- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py @@ -1,7 +1,6 @@ from typing import Dict, Optional, Tuple, List from mlagents.torch_utils import torch import numpy as np -import math from collections import defaultdict from mlagents.trainers.buffer import AgentBuffer, AgentBufferField @@ -77,18 +76,18 @@ def _evaluate_by_sequence( """ num_experiences = tensor_obs[0].shape[0] all_next_memories = AgentBufferField() - # In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and - # trajectory is of length 10, the last sequence is [obs,pad,pad]. - # Compute the number of elements in this padded seq. - leftover = num_experiences % self.policy.sequence_length + # When using LSTM, we need to divide the trajectory into sequences of even length. Sometimes, + # that division isn't even, and we must pad the leftover sequence. + # When it is added to the buffer, the last sequence will be padded. So if seq_len = 3 and + # trajectory is of length 10, the last sequence is [obs,pad,pad] once it is added to the buffer. + # Compute the number of elements in this sequence that will end up being padded. + leftover_seq_len = num_experiences % self.policy.sequence_length all_values: Dict[str, List[np.ndarray]] = defaultdict(list) _mem = initial_memory # Evaluate other trajectories, carrying over _mem after each # trajectory - for seq_num in range( - 0, math.floor((num_experiences) / (self.policy.sequence_length)) - ): + for seq_num in range(num_experiences // (self.policy.sequence_length)): seq_obs = [] for _ in range(self.policy.sequence_length): all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) @@ -103,22 +102,22 @@ def _evaluate_by_sequence( for signal_name, _val in values.items(): all_values[signal_name].append(_val) - # Compute values for the potentially truncated last sequence + # Compute values for the potentially truncated last sequence. Note that this + # sequence isn't padded yet, but will be. seq_obs = [] - last_seq_len = leftover - if last_seq_len > 0: + if leftover_seq_len > 0: for _obs in tensor_obs: - last_seq_obs = _obs[-last_seq_len:] + last_seq_obs = _obs[-leftover_seq_len:] seq_obs.append(last_seq_obs) # For the last sequence, the initial memory should be the one at the # end of this trajectory. 
- for _ in range(last_seq_len): + for _ in range(leftover_seq_len): all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) last_values, _mem = self.critic.critic_pass( - seq_obs, _mem, sequence_length=last_seq_len + seq_obs, _mem, sequence_length=leftover_seq_len ) for signal_name, _val in last_values.items(): all_values[signal_name].append(_val) diff --git a/ml-agents/mlagents/trainers/poca/optimizer_torch.py b/ml-agents/mlagents/trainers/poca/optimizer_torch.py index 108520f38b..55cb8ce87e 100644 --- a/ml-agents/mlagents/trainers/poca/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/poca/optimizer_torch.py @@ -4,7 +4,6 @@ ExtrinsicRewardProvider, ) import numpy as np -import math from mlagents.torch_utils import torch, default_device from mlagents.trainers.buffer import ( @@ -382,10 +381,13 @@ def _evaluate_by_sequence_team( num_experiences = self_obs[0].shape[0] all_next_value_mem = AgentBufferField() all_next_baseline_mem = AgentBufferField() + + # When using LSTM, we need to divide the trajectory into sequences of even length. Sometimes, + # that division isn't even, and we must pad the leftover sequence. # In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and # trajectory is of length 10, the last sequence is [obs,pad,pad]. # Compute the number of elements in this padded seq. - leftover = num_experiences % self.policy.sequence_length + leftover_seq_len = num_experiences % self.policy.sequence_length all_values: Dict[str, List[np.ndarray]] = defaultdict(list) all_baseline: Dict[str, List[np.ndarray]] = defaultdict(list) @@ -394,9 +396,7 @@ def _evaluate_by_sequence_team( # Evaluate other trajectories, carrying over _mem after each # trajectory - for seq_num in range( - 0, math.floor((num_experiences) / (self.policy.sequence_length)) - ): + for seq_num in range(num_experiences // self.policy.sequence_length): for _ in range(self.policy.sequence_length): all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze())) all_next_baseline_mem.append( @@ -441,31 +441,29 @@ def _evaluate_by_sequence_team( all_baseline[signal_name].append(_val) # Compute values for the potentially truncated initial sequence - last_seq_len = leftover - if last_seq_len > 0: + if leftover_seq_len > 0: self_seq_obs = [] groupmate_seq_obs = [] groupmate_seq_act = [] seq_obs = [] for _self_obs in self_obs: - last_seq_obs = _self_obs[-last_seq_len:] + last_seq_obs = _self_obs[-leftover_seq_len:] seq_obs.append(last_seq_obs) self_seq_obs.append(seq_obs) for groupmate_obs, groupmate_action in zip(obs, actions): seq_obs = [] for _obs in groupmate_obs: - last_seq_obs = _obs[-last_seq_len:] + last_seq_obs = _obs[-leftover_seq_len:] seq_obs.append(last_seq_obs) groupmate_seq_obs.append(seq_obs) - _act = groupmate_action.slice(len(_obs) - last_seq_len, len(_obs)) + _act = groupmate_action.slice(len(_obs) - leftover_seq_len, len(_obs)) groupmate_seq_act.append(_act) # For the last sequence, the initial memory should be the one at the # beginning of this trajectory. 
seq_obs = [] - last_seq_len = leftover - for _ in range(last_seq_len): + for _ in range(leftover_seq_len): all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze())) all_next_baseline_mem.append( ModelUtils.to_numpy(_baseline_mem.squeeze()) @@ -473,7 +471,7 @@ def _evaluate_by_sequence_team( all_seq_obs = self_seq_obs + groupmate_seq_obs last_values, _value_mem = self.critic.critic_pass( - all_seq_obs, _value_mem, sequence_length=last_seq_len + all_seq_obs, _value_mem, sequence_length=leftover_seq_len ) for signal_name, _val in last_values.items(): all_values[signal_name].append(_val) @@ -482,7 +480,7 @@ def _evaluate_by_sequence_team( self_seq_obs[0], groupmate_obs_and_actions, _baseline_mem, - sequence_length=last_seq_len, + sequence_length=leftover_seq_len, ) for signal_name, _val in last_baseline.items(): all_baseline[signal_name].append(_val) From 795c269b6b80fe4958383e38e20232c3f174fe1e Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 31 Mar 2021 19:10:15 -0400 Subject: [PATCH 08/13] Fix poca test --- ml-agents/mlagents/trainers/tests/torch/test_poca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_poca.py b/ml-agents/mlagents/trainers/tests/torch/test_poca.py index d48ae6107c..01080bf27b 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_poca.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_poca.py @@ -147,14 +147,14 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete): ) for key, val in value_estimates.items(): assert type(key) is str - assert len(val) == 15 + assert len(val) == time_horizon for key, val in baseline_estimates.items(): assert type(key) is str - assert len(val) == 15 + assert len(val) == time_horizon if value_memories is not None: - assert len(value_memories) == 15 - assert len(baseline_memories) == 15 + assert len(value_memories) == time_horizon + assert len(baseline_memories) == time_horizon ( value_estimates, From 8a0f0445196203bb60c8ccbf334d268e59e3c9f7 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 1 Apr 2021 17:47:31 -0400 Subject: [PATCH 09/13] Fix buffer test --- ml-agents/mlagents/trainers/tests/test_buffer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_buffer.py b/ml-agents/mlagents/trainers/tests/test_buffer.py index 802b36f725..e831253c57 100644 --- a/ml-agents/mlagents/trainers/tests/test_buffer.py +++ b/ml-agents/mlagents/trainers/tests/test_buffer.py @@ -110,9 +110,6 @@ def test_buffer(): np.array(a), np.array( [ - [0, 0, 0], - [0, 0, 0], - [0, 0, 0], [201, 202, 203], [211, 212, 213], [221, 222, 223], @@ -122,6 +119,9 @@ def test_buffer(): [261, 262, 263], [271, 272, 273], [281, 282, 283], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], ] ), ) @@ -129,10 +129,10 @@ def test_buffer(): a = agent_2_buffer[BufferKey.GROUP_CONTINUOUS_ACTION].get_batch( batch_size=None, training_length=4, sequential=True ) - for _group_entry in a[:3]: - assert len(_group_entry) == 0 - for _group_entry in a[3:]: + for _group_entry in a[:-3]: assert len(_group_entry) == 3 + for _group_entry in a[-3:]: + assert len(_group_entry) == 0 agent_1_buffer.reset_agent() assert agent_1_buffer.num_experiences == 0 From 5a6f8d720c111e2c78f05ddd30e748b667c5ba43 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 1 Apr 2021 17:49:49 -0400 Subject: [PATCH 10/13] Increase entropy for Hallway --- config/ppo/Hallway.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/config/ppo/Hallway.yaml b/config/ppo/Hallway.yaml index 241bf56774..74d491f57c 100644 --- a/config/ppo/Hallway.yaml +++ b/config/ppo/Hallway.yaml @@ -1,11 +1,11 @@ behaviors: - Hallway: + Hallway2: trainer_type: ppo hyperparameters: batch_size: 128 buffer_size: 1024 learning_rate: 0.0003 - beta: 0.01 + beta: 0.03 epsilon: 0.2 lambd: 0.95 num_epoch: 3 @@ -26,4 +26,4 @@ behaviors: max_steps: 10000000 time_horizon: 64 summary_freq: 10000 - threaded: true + threaded: true \ No newline at end of file From 234f7c80cfde3d51addc9792b0c5086a5674f6e2 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 1 Apr 2021 17:53:28 -0400 Subject: [PATCH 11/13] Add EOF newline --- config/ppo/Hallway.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/ppo/Hallway.yaml b/config/ppo/Hallway.yaml index 74d491f57c..4d5488a827 100644 --- a/config/ppo/Hallway.yaml +++ b/config/ppo/Hallway.yaml @@ -26,4 +26,4 @@ behaviors: max_steps: 10000000 time_horizon: 64 summary_freq: 10000 - threaded: true \ No newline at end of file + threaded: true From 54b3766e931b447b40b40ec78e9f29619ad3270e Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 1 Apr 2021 17:53:59 -0400 Subject: [PATCH 12/13] Fix Behavior Name --- config/ppo/Hallway.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/ppo/Hallway.yaml b/config/ppo/Hallway.yaml index 4d5488a827..4ffdd5612f 100644 --- a/config/ppo/Hallway.yaml +++ b/config/ppo/Hallway.yaml @@ -1,5 +1,5 @@ behaviors: - Hallway2: + Hallway: trainer_type: ppo hyperparameters: batch_size: 128 From e2ae9aa426d395ef60a96402747214491d8d2d47 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 5 Apr 2021 16:51:09 -0400 Subject: [PATCH 13/13] Address comments --- ml-agents/mlagents/trainers/optimizer/torch_optimizer.py | 4 ++-- ml-agents/mlagents/trainers/poca/optimizer_torch.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py index 04c3968081..7f1637164b 100644 --- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py @@ -76,7 +76,7 @@ def _evaluate_by_sequence( """ num_experiences = tensor_obs[0].shape[0] all_next_memories = AgentBufferField() - # When using LSTM, we need to divide the trajectory into sequences of even length. Sometimes, + # When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes, # that division isn't even, and we must pad the leftover sequence. # When it is added to the buffer, the last sequence will be padded. So if seq_len = 3 and # trajectory is of length 10, the last sequence is [obs,pad,pad] once it is added to the buffer. 
@@ -87,7 +87,7 @@ def _evaluate_by_sequence( _mem = initial_memory # Evaluate other trajectories, carrying over _mem after each # trajectory - for seq_num in range(num_experiences // (self.policy.sequence_length)): + for seq_num in range(num_experiences // self.policy.sequence_length): seq_obs = [] for _ in range(self.policy.sequence_length): all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) diff --git a/ml-agents/mlagents/trainers/poca/optimizer_torch.py b/ml-agents/mlagents/trainers/poca/optimizer_torch.py index 55cb8ce87e..fd915c2ea9 100644 --- a/ml-agents/mlagents/trainers/poca/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/poca/optimizer_torch.py @@ -382,7 +382,7 @@ def _evaluate_by_sequence_team( all_next_value_mem = AgentBufferField() all_next_baseline_mem = AgentBufferField() - # When using LSTM, we need to divide the trajectory into sequences of even length. Sometimes, + # When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes, # that division isn't even, and we must pad the leftover sequence. # In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and # trajectory is of length 10, the last sequence is [obs,pad,pad].
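
For reference, the end-padding behavior that patch 01 introduces in `AgentBufferField.get_batch` can be shown with a minimal, self-contained sketch. `pad_sequential_batch` below is a hypothetical helper written only for illustration (it is not the ml-agents buffer API); it assumes the same padding rule as the diff: duplicate the last entry, scale it by `padding_value`, and append it so the final sequence reads [obs, pad, pad] rather than [pad, pad, obs].

import numpy as np

def pad_sequential_batch(field, training_length, padding_value=0.0):
    """Return a copy of `field` whose length is a multiple of `training_length`,
    padded at the *end* so the final sequence looks like [obs, pad, pad]
    instead of [pad, pad, obs]."""
    leftover = len(field) % training_length
    if leftover == 0:
        return list(field)
    # Duplicate the last entry, scaled by padding_value, to fill out the final sequence.
    padding = np.array(field[-1], dtype=np.float32) * padding_value
    return list(field) + [padding] * (training_length - leftover)

# A 10-step trajectory with sequence length 3 becomes 12 entries;
# the two appended entries are the padding.
trajectory = [np.full(3, step, dtype=np.float32) for step in range(10)]
padded = pad_sequential_batch(trajectory, training_length=3)
assert len(padded) == 12
assert np.all(padded[9] == 9.0) and np.all(padded[-1] == 0.0)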
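
The control flow that patches 02, 03, and 07 converge on for `_evaluate_by_sequence` is easier to see outside the diff context. The sketch below is a simplified NumPy stand-in, assuming a `critic` callable that returns a dict of per-step value arrays plus the next memory; the real implementation works on torch tensors, takes one tensor per observation sensor, and (for POCA) runs an analogous baseline pass with its own memory. `evaluate_by_sequence` and `toy_critic` are illustrative names, not the actual API.

from collections import defaultdict
import numpy as np

def evaluate_by_sequence(obs, initial_memory, critic, sequence_length):
    """Run a recurrent critic over a [T, obs_dim] trajectory in fixed-length
    sequences, carrying the memory forward, and evaluate the leftover
    (to-be-padded) steps last so stored memories line up with an end-padded buffer."""
    num_experiences = obs.shape[0]
    leftover_seq_len = num_experiences % sequence_length

    all_values = defaultdict(list)   # reward-signal name -> list of value arrays
    all_next_memories = []           # memory the critic had entering each step's sequence
    _mem = initial_memory

    # Full sequences, in order, carrying the memory across sequence boundaries.
    for seq_num in range(num_experiences // sequence_length):
        start = seq_num * sequence_length
        end = start + sequence_length
        all_next_memories.extend([_mem.copy()] * sequence_length)
        values, _mem = critic(obs[start:end], _mem)
        for name, val in values.items():
            all_values[name].append(val)

    # Truncated tail, if any: only the real steps are evaluated here;
    # the padding is added later, when the trajectory goes into the buffer.
    if leftover_seq_len > 0:
        all_next_memories.extend([_mem.copy()] * leftover_seq_len)
        values, _mem = critic(obs[-leftover_seq_len:], _mem)
        for name, val in values.items():
            all_values[name].append(val)

    return all_values, all_next_memories

# Toy critic: the "value" is the per-step observation sum; the "memory" just counts steps.
def toy_critic(obs_seq, memory):
    return {"extrinsic": obs_seq.sum(axis=-1)}, memory + len(obs_seq)

values, memories = evaluate_by_sequence(
    np.ones((10, 4)), np.zeros(2), toy_critic, sequence_length=3
)
assert sum(v.shape[0] for v in values["extrinsic"]) == 10  # one value per real step
assert len(memories) == 10                                 # one memory per real step

Evaluating the full sequences first and the leftover steps last mirrors the new buffer layout, so the memory saved for each step is the state the recurrent critic actually had when it reached that step.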