Skip to content

Commit efa8f34

Browse files
author
Ervin T
committed
[bug-fix] Fix POCA LSTM, pad sequences in the back (#5206)
* Pad buffer at the end
* Fix padding in optimizer value estimate
* Fix additional bugs and POCA
* Fix groupmate obs, add tests
* Update changelog
* Improve tests
* Address comments
* Fix poca test
* Fix buffer test
* Increase entropy for Hallway
* Add EOF newline
* Fix Behavior Name
* Address comments

(cherry picked from commit 2ce6810)
1 parent deebc3d commit efa8f34

File tree

8 files changed

+125
-130
lines changed

8 files changed

+125
-130
lines changed

com.unity.ml-agents/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ and this project adheres to
99
## [1.9.1-preview]
1010
### Bug Fixes
1111
#### ml-agents / ml-agents-envs / gym-unity (Python)
12+
- Fixed an issue which was causing increased variance when using LSTMs. Also fixed an issue with LSTM when used with POCA and `sequence_length` < `time_horizon`. (#5206)
1213
- Fixed a bug where the SAC replay buffer would not be saved out at the end of a run, even if `save_replay_buffer` was enabled. (#5205)
1314

1415
## [1.9.0-preview] - 2021-03-17

config/ppo/Hallway.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ behaviors:
55
batch_size: 128
66
buffer_size: 1024
77
learning_rate: 0.0003
8-
beta: 0.01
8+
beta: 0.03
99
epsilon: 0.2
1010
lambd: 0.95
1111
num_epoch: 3

ml-agents/mlagents/trainers/buffer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def get_batch(
180180
else:
181181
# We want to duplicate the last value in the array, multiplied by the padding_value.
182182
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
183-
return [padding] * (training_length - leftover) + self[:]
183+
return self[:] + [padding] * (training_length - leftover)
184184

185185
else:
186186
return self[len(self) - batch_size * training_length :]

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py

Lines changed: 35 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Dict, Optional, Tuple, List
22
from mlagents.torch_utils import torch
33
import numpy as np
4-
import math
4+
from collections import defaultdict
55

66
from mlagents.trainers.buffer import AgentBuffer, AgentBufferField
77
from mlagents.trainers.trajectory import ObsUtil
@@ -76,53 +76,52 @@ def _evaluate_by_sequence(
7676
"""
7777
num_experiences = tensor_obs[0].shape[0]
7878
all_next_memories = AgentBufferField()
79-
# In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and
80-
# trajectory is of length 10, the 1st sequence is [pad,pad,obs].
81-
# Compute the number of elements in this padded seq.
82-
leftover = num_experiences % self.policy.sequence_length
83-
84-
# Compute values for the potentially truncated initial sequence
85-
seq_obs = []
86-
87-
first_seq_len = leftover if leftover > 0 else self.policy.sequence_length
88-
for _obs in tensor_obs:
89-
first_seq_obs = _obs[0:first_seq_len]
90-
seq_obs.append(first_seq_obs)
91-
92-
# For the first sequence, the initial memory should be the one at the
93-
# beginning of this trajectory.
94-
for _ in range(first_seq_len):
95-
all_next_memories.append(ModelUtils.to_numpy(initial_memory.squeeze()))
96-
97-
init_values, _mem = self.critic.critic_pass(
98-
seq_obs, initial_memory, sequence_length=first_seq_len
99-
)
100-
all_values = {
101-
signal_name: [init_values[signal_name]]
102-
for signal_name in init_values.keys()
103-
}
104-
79+
# When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes,
80+
# that division isn't even, and we must pad the leftover sequence.
81+
# When it is added to the buffer, the last sequence will be padded. So if seq_len = 3 and
82+
# trajectory is of length 10, the last sequence is [obs,pad,pad] once it is added to the buffer.
83+
# Compute the number of elements in this sequence that will end up being padded.
84+
leftover_seq_len = num_experiences % self.policy.sequence_length
85+
86+
all_values: Dict[str, List[np.ndarray]] = defaultdict(list)
87+
_mem = initial_memory
10588
# Evaluate each full-length sequence, carrying over _mem after each
10689
# sequence
107-
for seq_num in range(
108-
1, math.ceil((num_experiences) / (self.policy.sequence_length))
109-
):
90+
for seq_num in range(num_experiences // self.policy.sequence_length):
11091
seq_obs = []
11192
for _ in range(self.policy.sequence_length):
11293
all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
113-
start = seq_num * self.policy.sequence_length - (
114-
self.policy.sequence_length - leftover
115-
)
116-
end = (seq_num + 1) * self.policy.sequence_length - (
117-
self.policy.sequence_length - leftover
118-
)
94+
start = seq_num * self.policy.sequence_length
95+
end = (seq_num + 1) * self.policy.sequence_length
96+
11997
for _obs in tensor_obs:
12098
seq_obs.append(_obs[start:end])
12199
values, _mem = self.critic.critic_pass(
122100
seq_obs, _mem, sequence_length=self.policy.sequence_length
123101
)
124102
for signal_name, _val in values.items():
125103
all_values[signal_name].append(_val)
104+
105+
# Compute values for the potentially truncated last sequence. Note that this
106+
# sequence isn't padded yet, but will be.
107+
seq_obs = []
108+
109+
if leftover_seq_len > 0:
110+
for _obs in tensor_obs:
111+
last_seq_obs = _obs[-leftover_seq_len:]
112+
seq_obs.append(last_seq_obs)
113+
114+
# For the last sequence, the initial memory should be the one at the
115+
# end of the preceding full sequences (the initial memory if there are none).
116+
for _ in range(leftover_seq_len):
117+
all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
118+
119+
last_values, _mem = self.critic.critic_pass(
120+
seq_obs, _mem, sequence_length=leftover_seq_len
121+
)
122+
for signal_name, _val in last_values.items():
123+
all_values[signal_name].append(_val)
124+
126125
# Create one tensor per reward signal
127126
all_value_tensors = {
128127
signal_name: torch.cat(value_list, dim=0)

ml-agents/mlagents/trainers/poca/optimizer_torch.py

Lines changed: 70 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from typing import Dict, cast, List, Tuple, Optional
2+
from collections import defaultdict
23
from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import (
34
ExtrinsicRewardProvider,
45
)
56
import numpy as np
6-
import math
77
from mlagents.torch_utils import torch, default_device
88

99
from mlagents.trainers.buffer import (
@@ -381,116 +381,109 @@ def _evaluate_by_sequence_team(
381381
num_experiences = self_obs[0].shape[0]
382382
all_next_value_mem = AgentBufferField()
383383
all_next_baseline_mem = AgentBufferField()
384-
# In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and
385-
# trajectory is of length 10, the 1st sequence is [pad,pad,obs].
386-
# Compute the number of elements in this padded seq.
387-
leftover = num_experiences % self.policy.sequence_length
388-
389-
# Compute values for the potentially truncated initial sequence
390384

391-
first_seq_len = leftover if leftover > 0 else self.policy.sequence_length
392-
393-
self_seq_obs = []
394-
groupmate_seq_obs = []
395-
groupmate_seq_act = []
396-
seq_obs = []
397-
for _self_obs in self_obs:
398-
first_seq_obs = _self_obs[0:first_seq_len]
399-
seq_obs.append(first_seq_obs)
400-
self_seq_obs.append(seq_obs)
401-
402-
for groupmate_obs, groupmate_action in zip(obs, actions):
403-
seq_obs = []
404-
for _obs in groupmate_obs:
405-
first_seq_obs = _obs[0:first_seq_len]
406-
seq_obs.append(first_seq_obs)
407-
groupmate_seq_obs.append(seq_obs)
408-
_act = groupmate_action.slice(0, first_seq_len)
409-
groupmate_seq_act.append(_act)
410-
411-
# For the first sequence, the initial memory should be the one at the
412-
# beginning of this trajectory.
413-
for _ in range(first_seq_len):
414-
all_next_value_mem.append(ModelUtils.to_numpy(init_value_mem.squeeze()))
415-
all_next_baseline_mem.append(
416-
ModelUtils.to_numpy(init_baseline_mem.squeeze())
417-
)
418-
419-
all_seq_obs = self_seq_obs + groupmate_seq_obs
420-
init_values, _value_mem = self.critic.critic_pass(
421-
all_seq_obs, init_value_mem, sequence_length=first_seq_len
422-
)
423-
all_values = {
424-
signal_name: [init_values[signal_name]]
425-
for signal_name in init_values.keys()
426-
}
385+
# When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes,
386+
# that division isn't even, and we must pad the leftover sequence.
387+
# In the buffer, the last sequence is the one that is padded. So if seq_len = 3 and
388+
# trajectory is of length 10, the last sequence is [obs,pad,pad].
389+
# Compute the number of elements in this padded seq.
390+
leftover_seq_len = num_experiences % self.policy.sequence_length
427391

428-
groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
429-
init_baseline, _baseline_mem = self.critic.baseline(
430-
self_seq_obs[0],
431-
groupmate_obs_and_actions,
432-
init_baseline_mem,
433-
sequence_length=first_seq_len,
434-
)
435-
all_baseline = {
436-
signal_name: [init_baseline[signal_name]]
437-
for signal_name in init_baseline.keys()
438-
}
392+
all_values: Dict[str, List[np.ndarray]] = defaultdict(list)
393+
all_baseline: Dict[str, List[np.ndarray]] = defaultdict(list)
394+
_baseline_mem = init_baseline_mem
395+
_value_mem = init_value_mem
439396

440397
# Evaluate each full-length sequence, carrying over the memories after each
441398
# sequence
442-
for seq_num in range(
443-
1, math.ceil((num_experiences) / (self.policy.sequence_length))
444-
):
399+
for seq_num in range(num_experiences // self.policy.sequence_length):
445400
for _ in range(self.policy.sequence_length):
446401
all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
447402
all_next_baseline_mem.append(
448403
ModelUtils.to_numpy(_baseline_mem.squeeze())
449404
)
450405

451-
start = seq_num * self.policy.sequence_length - (
452-
self.policy.sequence_length - leftover
453-
)
454-
end = (seq_num + 1) * self.policy.sequence_length - (
455-
self.policy.sequence_length - leftover
456-
)
406+
start = seq_num * self.policy.sequence_length
407+
end = (seq_num + 1) * self.policy.sequence_length
457408

458409
self_seq_obs = []
459410
groupmate_seq_obs = []
460411
groupmate_seq_act = []
461412
seq_obs = []
462413
for _self_obs in self_obs:
463-
seq_obs.append(_obs[start:end])
414+
seq_obs.append(_self_obs[start:end])
464415
self_seq_obs.append(seq_obs)
465416

466-
for groupmate_obs, team_action in zip(obs, actions):
417+
for groupmate_obs, groupmate_action in zip(obs, actions):
467418
seq_obs = []
468-
for (_obs,) in groupmate_obs:
469-
first_seq_obs = _obs[start:end]
470-
seq_obs.append(first_seq_obs)
419+
for _obs in groupmate_obs:
420+
sliced_seq_obs = _obs[start:end]
421+
seq_obs.append(sliced_seq_obs)
471422
groupmate_seq_obs.append(seq_obs)
472-
_act = team_action.slice(start, end)
423+
_act = groupmate_action.slice(start, end)
473424
groupmate_seq_act.append(_act)
474425

475426
all_seq_obs = self_seq_obs + groupmate_seq_obs
476427
values, _value_mem = self.critic.critic_pass(
477428
all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length
478429
)
479-
all_values = {
480-
signal_name: [init_values[signal_name]] for signal_name in values.keys()
481-
}
430+
for signal_name, _val in values.items():
431+
all_values[signal_name].append(_val)
482432

483433
groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
484434
baselines, _baseline_mem = self.critic.baseline(
485435
self_seq_obs[0],
486436
groupmate_obs_and_actions,
487437
_baseline_mem,
488-
sequence_length=first_seq_len,
438+
sequence_length=self.policy.sequence_length,
439+
)
440+
for signal_name, _val in baselines.items():
441+
all_baseline[signal_name].append(_val)
442+
443+
# Compute values for the potentially truncated last sequence
444+
if leftover_seq_len > 0:
445+
self_seq_obs = []
446+
groupmate_seq_obs = []
447+
groupmate_seq_act = []
448+
seq_obs = []
449+
for _self_obs in self_obs:
450+
last_seq_obs = _self_obs[-leftover_seq_len:]
451+
seq_obs.append(last_seq_obs)
452+
self_seq_obs.append(seq_obs)
453+
454+
for groupmate_obs, groupmate_action in zip(obs, actions):
455+
seq_obs = []
456+
for _obs in groupmate_obs:
457+
last_seq_obs = _obs[-leftover_seq_len:]
458+
seq_obs.append(last_seq_obs)
459+
groupmate_seq_obs.append(seq_obs)
460+
_act = groupmate_action.slice(len(_obs) - leftover_seq_len, len(_obs))
461+
groupmate_seq_act.append(_act)
462+
463+
# For the last sequence, the initial memory should be the one at the
464+
# end of the preceding full sequences (the initial memory if there are none).
465+
seq_obs = []
466+
for _ in range(leftover_seq_len):
467+
all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
468+
all_next_baseline_mem.append(
469+
ModelUtils.to_numpy(_baseline_mem.squeeze())
470+
)
471+
472+
all_seq_obs = self_seq_obs + groupmate_seq_obs
473+
last_values, _value_mem = self.critic.critic_pass(
474+
all_seq_obs, _value_mem, sequence_length=leftover_seq_len
475+
)
476+
for signal_name, _val in last_values.items():
477+
all_values[signal_name].append(_val)
478+
groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
479+
last_baseline, _baseline_mem = self.critic.baseline(
480+
self_seq_obs[0],
481+
groupmate_obs_and_actions,
482+
_baseline_mem,
483+
sequence_length=leftover_seq_len,
489484
)
490-
all_baseline = {
491-
signal_name: [baselines[signal_name]]
492-
for signal_name in baselines.keys()
493-
}
485+
for signal_name, _val in last_baseline.items():
486+
all_baseline[signal_name].append(_val)
494487
# Create one tensor per reward signal
495488
all_value_tensors = {
496489
signal_name: torch.cat(value_list, dim=0)

ml-agents/mlagents/trainers/tests/test_buffer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,6 @@ def test_buffer():
110110
np.array(a),
111111
np.array(
112112
[
113-
[0, 0, 0],
114-
[0, 0, 0],
115-
[0, 0, 0],
116113
[201, 202, 203],
117114
[211, 212, 213],
118115
[221, 222, 223],
@@ -122,17 +119,20 @@ def test_buffer():
122119
[261, 262, 263],
123120
[271, 272, 273],
124121
[281, 282, 283],
122+
[0, 0, 0],
123+
[0, 0, 0],
124+
[0, 0, 0],
125125
]
126126
),
127127
)
128128
# Test group entries return Lists of Lists. Make sure to pad properly!
129129
a = agent_2_buffer[BufferKey.GROUP_CONTINUOUS_ACTION].get_batch(
130130
batch_size=None, training_length=4, sequential=True
131131
)
132-
for _group_entry in a[:3]:
133-
assert len(_group_entry) == 0
134-
for _group_entry in a[3:]:
132+
for _group_entry in a[:-3]:
135133
assert len(_group_entry) == 3
134+
for _group_entry in a[-3:]:
135+
assert len(_group_entry) == 0
136136

137137
agent_1_buffer.reset_agent()
138138
assert agent_1_buffer.num_experiences == 0

ml-agents/mlagents/trainers/tests/torch/test_poca.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
6060
}
6161

6262
trainer_settings.network_settings.memory = (
63-
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
63+
NetworkSettings.MemorySettings(sequence_length=8, memory_size=10)
6464
if use_rnn
6565
else None
6666
)
@@ -125,7 +125,7 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete):
125125
optimizer = create_test_poca_optimizer(
126126
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
127127
)
128-
time_horizon = 15
128+
time_horizon = 30
129129
trajectory = make_fake_trajectory(
130130
length=time_horizon,
131131
observation_specs=optimizer.policy.behavior_spec.observation_specs,
@@ -147,14 +147,14 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete):
147147
)
148148
for key, val in value_estimates.items():
149149
assert type(key) is str
150-
assert len(val) == 15
150+
assert len(val) == time_horizon
151151
for key, val in baseline_estimates.items():
152152
assert type(key) is str
153-
assert len(val) == 15
153+
assert len(val) == time_horizon
154154

155155
if value_memories is not None:
156-
assert len(value_memories) == 15
157-
assert len(baseline_memories) == 15
156+
assert len(value_memories) == time_horizon
157+
assert len(baseline_memories) == time_horizon
158158

159159
(
160160
value_estimates,

0 commit comments

Comments
 (0)