
Commit 7cbb832

Author: Ervin T
[cherry-pick] Fix group rewards for POCA, add warning for non-POCA trainers (#5120)
* Fix end episode for POCA, add warning for group reward if not POCA (#5113)
* Fix end episode for POCA, add warning for group reward if not POCA
* Add missing imports
* Use np.any, which is faster
1 parent 47c5c56 commit 7cbb832

File tree: 6 files changed (+95, -12 lines)


ml-agents/mlagents/trainers/poca/trainer.py

Lines changed: 9 additions & 0 deletions
@@ -245,6 +245,15 @@ def _update_policy(self):
         self._clear_update_buffer()
         return True
 
+    def end_episode(self) -> None:
+        """
+        A signal that the Episode has ended. The buffer must be reset.
+        Gets called only when the academy resets. For POCA, we should
+        also zero out the group rewards.
+        """
+        super().end_episode()
+        self.collected_group_rewards.clear()
+
     def create_torch_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
     ) -> TorchPolicy:
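For context, the reset pattern added here is small but easy to get wrong: the trainer keeps a per-agent running total of group rewards, and an academy reset must drop those totals along with the rest of the episode state. Below is a minimal, self-contained sketch of that pattern; the BaseTrainer/GroupRewardTrainer names and the defaultdict storage are stand-ins for illustration, not the actual ML-Agents classes (the real change simply calls super().end_episode() and then self.collected_group_rewards.clear()).

from collections import defaultdict


class BaseTrainer:
    """Stand-in for the RL trainer base class (hypothetical, for illustration)."""

    def __init__(self) -> None:
        self.cumulative_returns = defaultdict(float)

    def end_episode(self) -> None:
        # The base class drops ordinary per-agent episode state on academy reset.
        self.cumulative_returns.clear()


class GroupRewardTrainer(BaseTrainer):
    """Stand-in for a POCA-style trainer that also tracks group rewards."""

    def __init__(self) -> None:
        super().__init__()
        self.collected_group_rewards = defaultdict(float)

    def process_step(self, agent_id: str, reward: float, group_reward: float) -> None:
        # Accumulate both individual and group rewards per agent.
        self.cumulative_returns[agent_id] += reward
        self.collected_group_rewards[agent_id] += group_reward

    def end_episode(self) -> None:
        # The fix: clear group-reward totals in addition to the base state,
        # so a stale total never leaks into the next episode's stats.
        super().end_episode()
        self.collected_group_rewards.clear()


if __name__ == "__main__":
    trainer = GroupRewardTrainer()
    trainer.process_step("agent_0", reward=0.5, group_reward=1.0)
    trainer.end_episode()
    assert not trainer.collected_group_rewards  # totals are gone after the reset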

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 3 additions & 0 deletions
@@ -68,6 +68,9 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
         agent_id = trajectory.agent_id  # All the agents should have the same ID
 
         agent_buffer_trajectory = trajectory.to_agentbuffer()
+        # Check if we used group rewards, warn if so.
+        self._warn_if_group_reward(agent_buffer_trajectory)
+
         # Update the normalization
         if self.is_training:
             self.policy.update_normalization(agent_buffer_trajectory)

ml-agents/mlagents/trainers/sac/trainer.py

Lines changed: 2 additions & 0 deletions
@@ -131,6 +131,8 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
         agent_id = trajectory.agent_id  # All the agents should have the same ID
 
         agent_buffer_trajectory = trajectory.to_agentbuffer()
+        # Check if we used group rewards, warn if so.
+        self._warn_if_group_reward(agent_buffer_trajectory)
 
         # Update the normalization
         if self.is_training:

ml-agents/mlagents/trainers/tests/mock_brain.py

Lines changed: 11 additions & 4 deletions
@@ -81,6 +81,8 @@ def make_fake_trajectory(
     max_step_complete: bool = False,
     memory_size: int = 10,
     num_other_agents_in_group: int = 0,
+    group_reward: float = 0.0,
+    is_terminal: bool = True,
 ) -> Trajectory:
     """
     Makes a fake trajectory of length length. If max_step_complete,
@@ -134,24 +136,29 @@
             interrupted=max_step,
             memory=memory,
             group_status=group_status,
-            group_reward=0,
+            group_reward=group_reward,
         )
         steps_list.append(experience)
     obs = []
     for obs_spec in observation_specs:
         obs.append(np.ones(obs_spec.shape, dtype=np.float32))
+    last_group_status = []
+    for _ in range(num_other_agents_in_group):
+        last_group_status.append(
+            AgentStatus(obs, reward, action, not max_step_complete and is_terminal)
+        )
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
-        done=not max_step_complete,
+        done=not max_step_complete and is_terminal,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
-        group_status=group_status,
-        group_reward=0,
+        group_status=last_group_status,
+        group_reward=group_reward,
    )
    steps_list.append(last_experience)
    return Trajectory(
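The interaction between max_step_complete and the new is_terminal flag determines the done signal written into the fake final experience: done = not max_step_complete and is_terminal. A tiny truth-table check (plain Python, independent of ML-Agents) makes the intent explicit; only a genuinely terminal step that was not interrupted by max-step reports done=True, which lets tests build trajectories that end mid-episode by passing is_terminal=False.

from itertools import product


def fake_done(max_step_complete: bool, is_terminal: bool) -> bool:
    # done as computed in make_fake_trajectory after this change
    return not max_step_complete and is_terminal


for max_step_complete, is_terminal in product([False, True], repeat=2):
    print(
        f"max_step_complete={max_step_complete!s:5} "
        f"is_terminal={is_terminal!s:5} -> done={fake_done(max_step_complete, is_terminal)}"
    )
# Only (max_step_complete=False, is_terminal=True) yields done=True.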

ml-agents/mlagents/trainers/tests/torch/test_poca.py

Lines changed: 55 additions & 7 deletions
@@ -1,9 +1,14 @@
+from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 import pytest
 
 import numpy as np
 import attr
 
+# Import to avoid circular import
+from mlagents.trainers.trainer.trainer_factory import TrainerFactory  # noqa F401
+
 from mlagents.trainers.poca.optimizer_torch import TorchPOCAOptimizer
+from mlagents.trainers.poca.trainer import POCATrainer
 from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
 
 from mlagents.trainers.policy.torch_policy import TorchPolicy
@@ -12,19 +17,21 @@
 from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
 from mlagents.trainers.settings import NetworkSettings
 from mlagents.trainers.tests.dummy_config import (  # noqa: F401
-    ppo_dummy_config,
+    create_observation_specs_with_shapes,
+    poca_dummy_config,
     curiosity_dummy_config,
     gail_dummy_config,
 )
+from mlagents.trainers.agent_processor import AgentManagerQueue
+from mlagents.trainers.settings import TrainerSettings
 
-from mlagents_envs.base_env import ActionSpec
+from mlagents_envs.base_env import ActionSpec, BehaviorSpec
 from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
 
 
 @pytest.fixture
 def dummy_config():
-    # poca has the same hyperparameters as ppo for now
-    return ppo_dummy_config()
+    return poca_dummy_config()
 
 
 VECTOR_ACTION_SPACE = 2
@@ -188,7 +195,7 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete):
 @pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
 @pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
 # We need to test this separately from test_reward_signals.py to ensure no interactions
-def test_ppo_optimizer_update_curiosity(
+def test_poca_optimizer_update_curiosity(
     dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
 ):
     # Test evaluate
@@ -230,10 +237,10 @@ def test_ppo_optimizer_update_curiosity(
 
 
 # We need to test this separately from test_reward_signals.py to ensure no interactions
-def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
+def test_poca_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
     # Test evaluate
     dummy_config.reward_signals = gail_dummy_config
-    config = ppo_dummy_config()
+    config = poca_dummy_config()
     optimizer = create_test_poca_optimizer(
         config, use_rnn=False, use_discrete=False, use_visual=False
     )
@@ -286,5 +293,46 @@ def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F8
     )
 
 
+def test_poca_end_episode():
+    name_behavior_id = "test_trainer"
+    trainer = POCATrainer(
+        name_behavior_id,
+        10,
+        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
+        True,
+        False,
+        0,
+        "mock_model_path",
+    )
+    behavior_spec = BehaviorSpec(
+        create_observation_specs_with_shapes([(1,)]), ActionSpec.create_discrete((2,))
+    )
+    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
+    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
+    trainer.add_policy(parsed_behavior_id, mock_policy)
+    trajectory_queue = AgentManagerQueue("testbrain")
+    policy_queue = AgentManagerQueue("testbrain")
+    trainer.subscribe_trajectory_queue(trajectory_queue)
+    trainer.publish_policy_queue(policy_queue)
+    time_horizon = 10
+    trajectory = mb.make_fake_trajectory(
+        length=time_horizon,
+        observation_specs=behavior_spec.observation_specs,
+        max_step_complete=False,
+        action_spec=behavior_spec.action_spec,
+        num_other_agents_in_group=2,
+        group_reward=1.0,
+        is_terminal=False,
+    )
+    trajectory_queue.put(trajectory)
+    trainer.advance()
+    # Test that some trajectories have been ingested
+    for reward in trainer.collected_group_rewards.values():
+        assert reward == 10
+    # Test end episode
+    trainer.end_episode()
+    assert len(trainer.collected_group_rewards.keys()) == 0
+
+
 if __name__ == "__main__":
     pytest.main()
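The assert reward == 10 in the new test follows directly from the trajectory setup: ten steps (time_horizon = 10) each carrying group_reward=1.0 accumulate to 10 per tracked agent before end_episode() clears the dictionary, assuming the group reward is attached to every step of the fake trajectory, which is what the mock_brain change above does. A quick stand-alone check of that arithmetic, using the same numbers as the test:

time_horizon = 10            # length of the fake trajectory in the test
group_reward_per_step = 1.0  # group_reward passed to make_fake_trajectory

expected_total = time_horizon * group_reward_per_step
assert expected_total == 10  # matches the assertion on collected_group_rewards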

ml-agents/mlagents/trainers/trainer/rl_trainer.py

Lines changed: 15 additions & 1 deletion
@@ -4,6 +4,7 @@
 import abc
 import time
 import attr
+import numpy as np
 from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
 
 from mlagents.trainers.policy.checkpoint_manager import (
@@ -13,7 +14,7 @@
 from mlagents_envs.logging_util import get_logger
 from mlagents_envs.timers import timed
 from mlagents.trainers.optimizer import Optimizer
-from mlagents.trainers.buffer import AgentBuffer
+from mlagents.trainers.buffer import AgentBuffer, BufferKey
 from mlagents.trainers.trainer import Trainer
 from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
     BaseRewardProvider,
@@ -58,6 +59,7 @@ def __init__(self, *args, **kwargs):
         self.model_saver = self.create_model_saver(
             self.trainer_settings, self.artifact_path, self.load
         )
+        self._has_warned_group_rewards = False
 
     def end_episode(self) -> None:
         """
@@ -256,6 +258,18 @@ def _maybe_save_model(self, step_after_process: int) -> None:
         if step_after_process >= self._next_save_step and self.get_step != 0:
             self._checkpoint()
 
+    def _warn_if_group_reward(self, buffer: AgentBuffer) -> None:
+        """
+        Warn if the trainer receives a Group Reward but isn't a multiagent trainer (e.g. POCA).
+        """
+        if not self._has_warned_group_rewards:
+            if np.any(buffer[BufferKey.GROUP_REWARD]):
+                logger.warning(
+                    "An agent received a Group Reward, but you are not using a multi-agent trainer. "
+                    "Please use the POCA trainer for best results."
+                )
+                self._has_warned_group_rewards = True
+
     def advance(self) -> None:
         """
         Steps the trainer, taking in trajectories and updates if ready.
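The warning path added here boils down to two pieces: np.any over the trajectory's group-reward column (cheap, vectorized, and True as soon as any step carries a nonzero group reward) and a _has_warned_group_rewards flag so the message is emitted at most once per trainer. Below is a self-contained sketch of that logic, using a plain NumPy array in place of the real AgentBuffer[BufferKey.GROUP_REWARD] field; the GroupRewardWatcher name is hypothetical and standard logging stands in for the ML-Agents logger.

import logging

import numpy as np

logger = logging.getLogger(__name__)


class GroupRewardWatcher:
    """Stand-in for the warn-once bookkeeping added to RLTrainer (illustrative only)."""

    def __init__(self) -> None:
        self._has_warned_group_rewards = False

    def warn_if_group_reward(self, group_rewards: np.ndarray) -> None:
        # Warn only once, and only if some step actually carried a group reward.
        if not self._has_warned_group_rewards and np.any(group_rewards):
            logger.warning(
                "An agent received a Group Reward, but you are not using a "
                "multi-agent trainer. Please use the POCA trainer for best results."
            )
            self._has_warned_group_rewards = True


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    watcher = GroupRewardWatcher()
    watcher.warn_if_group_reward(np.zeros(10))          # all zero: no warning
    watcher.warn_if_group_reward(np.array([0.0, 1.0]))  # nonzero: warns once
    watcher.warn_if_group_reward(np.array([0.0, 1.0]))  # already warned: silent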
