From 7e7c3e2ff911358d71165946830a0cfbd6e6da09 Mon Sep 17 00:00:00 2001 From: Chingiz Mardanov Date: Tue, 26 Oct 2021 17:37:22 -0400 Subject: [PATCH 01/15] Progress on propagating the setting to the action model. --- ml-agents/mlagents/trainers/cli_utils.py | 7 +++++++ ml-agents/mlagents/trainers/settings.py | 9 +++++++++ ml-agents/mlagents/trainers/torch/action_model.py | 2 ++ ml-agents/mlagents/trainers/torch/networks.py | 1 + 4 files changed, 19 insertions(+) diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index 5884c3a5c5..ca7fd1e02e 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -91,6 +91,13 @@ def _create_parser() -> argparse.ArgumentParser: "before resuming training. This option is only valid when the models exist, and have the same " "behavior names as the current agents in your scene.", ) + argparser.add_argument( + "--deterministic", + default=False, + dest="deterministic", + action=DetectDefaultStoreTrue, + help="Whether to use the deterministic samples from the data.", + ) argparser.add_argument( "--force", default=False, diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index fe52fb838c..24a46a7120 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -151,6 +151,7 @@ def _check_valid_memory_size(self, attribute, value): vis_encode_type: EncoderType = EncoderType.SIMPLE memory: Optional[MemorySettings] = None goal_conditioning_type: ConditioningType = ConditioningType.HYPER + deterministic: bool = parser.get_default("deterministic") @attr.s(auto_attribs=True) @@ -928,6 +929,7 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": key ) ) + # Override with CLI args # Keep deprecated --load working, TODO: remove argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"] @@ -950,6 +952,13 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": if isinstance(final_runoptions.behaviors, TrainerSettings.DefaultTrainerDict): # configure whether or not we should require all behavior names to be found in the config YAML final_runoptions.behaviors.set_config_specified(_require_all_behaviors) + + for behaviour in final_runoptions.behaviors.keys(): + if not final_runoptions.behaviors[behaviour].network_settings.deterministic: + final_runoptions.behaviors[ + behaviour + ].network_settings.deterministic = argparse_args["deterministic"] + return final_runoptions @staticmethod diff --git a/ml-agents/mlagents/trainers/torch/action_model.py b/ml-agents/mlagents/trainers/torch/action_model.py index c5de586e4d..28fc161edd 100644 --- a/ml-agents/mlagents/trainers/torch/action_model.py +++ b/ml-agents/mlagents/trainers/torch/action_model.py @@ -32,6 +32,7 @@ def __init__( action_spec: ActionSpec, conditional_sigma: bool = False, tanh_squash: bool = False, + deterministic: bool = False, ): """ A torch module that represents the action space of a policy. The ActionModel may contain @@ -66,6 +67,7 @@ def __init__( # During training, clipping is done in TorchPolicy, but we need to clip before ONNX # export as well. 
self._clip_action_on_export = not tanh_squash + self.deterministic = deterministic def _sample_action(self, dists: DistInstances) -> AgentAction: """ diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index 4a2e1dafc6..19b97e2860 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -617,6 +617,7 @@ def __init__( action_spec, conditional_sigma=conditional_sigma, tanh_squash=tanh_squash, + deterministic=network_settings.deterministic, ) @property From 824f54b67065ae4393af60c412700d46beb754c5 Mon Sep 17 00:00:00 2001 From: Chingiz Mardanov Date: Wed, 27 Oct 2021 11:18:28 -0400 Subject: [PATCH 02/15] Added the _sample_action logic and tests. --- .../mlagents/trainers/tests/test_settings.py | 2 ++ .../trainers/tests/torch/test_action_model.py | 29 +++++++++++++++++-- .../mlagents/trainers/torch/action_model.py | 14 +++++++-- .../mlagents/trainers/torch/distributions.py | 13 +++++++++ 4 files changed, 53 insertions(+), 5 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_settings.py b/ml-agents/mlagents/trainers/tests/test_settings.py index 5fe453e43b..e53d0d566d 100644 --- a/ml-agents/mlagents/trainers/tests/test_settings.py +++ b/ml-agents/mlagents/trainers/tests/test_settings.py @@ -389,6 +389,7 @@ def test_exportable_settings(use_defaults): init_entcoef: 0.5 reward_signal_steps_per_update: 10.0 network_settings: + deterministic: true normalize: false hidden_units: 256 num_layers: 3 @@ -541,6 +542,7 @@ def test_default_settings(): test1_settings = run_options.behaviors["test1"] assert test1_settings.max_steps == 2 assert test1_settings.network_settings.hidden_units == 2000 + assert not test1_settings.network_settings.deterministic assert test1_settings.network_settings.num_layers == 1000 # Change the overridden fields back, and check if the rest are equal. 
test1_settings.max_steps = 1 diff --git a/ml-agents/mlagents/trainers/tests/torch/test_action_model.py b/ml-agents/mlagents/trainers/tests/torch/test_action_model.py index 9722931446..1365c5fdba 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_action_model.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_action_model.py @@ -11,10 +11,10 @@ from mlagents_envs.base_env import ActionSpec -def create_action_model(inp_size, act_size): +def create_action_model(inp_size, act_size, deterministic=False): mask = torch.ones([1, act_size * 2]) action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size))) - action_model = ActionModel(inp_size, action_spec) + action_model = ActionModel(inp_size, action_spec, deterministic=deterministic) return action_model, mask @@ -43,6 +43,31 @@ def test_sample_action(): assert _disc.shape == (1, 1) +def test_deterministic_sample_action(): + inp_size = 4 + act_size = 2 + action_model, masks = create_action_model(inp_size, act_size, deterministic=True) + sample_inp = torch.ones((1, inp_size)) + dists = action_model._get_dists(sample_inp, masks=masks) + agent_action1 = action_model._sample_action(dists) + agent_action2 = action_model._sample_action(dists) + agent_action3 = action_model._sample_action(dists) + assert torch.equal(agent_action1.continuous_tensor, agent_action2.continuous_tensor) + assert torch.equal(agent_action1.continuous_tensor, agent_action3.continuous_tensor) + action_model, masks = create_action_model(inp_size, act_size, deterministic=False) + sample_inp = torch.ones((1, inp_size)) + dists = action_model._get_dists(sample_inp, masks=masks) + agent_action1 = action_model._sample_action(dists) + agent_action2 = action_model._sample_action(dists) + agent_action3 = action_model._sample_action(dists) + assert not torch.equal( + agent_action1.continuous_tensor, agent_action2.continuous_tensor + ) + assert not torch.equal( + agent_action1.continuous_tensor, agent_action3.continuous_tensor + ) + + def test_get_probs_and_entropy(): inp_size = 4 act_size = 2 diff --git a/ml-agents/mlagents/trainers/torch/action_model.py b/ml-agents/mlagents/trainers/torch/action_model.py index 28fc161edd..8d18416f68 100644 --- a/ml-agents/mlagents/trainers/torch/action_model.py +++ b/ml-agents/mlagents/trainers/torch/action_model.py @@ -75,15 +75,23 @@ def _sample_action(self, dists: DistInstances) -> AgentAction: :params dists: The DistInstances tuple :return: An AgentAction corresponding to the actions sampled from the DistInstances """ + continuous_action: Optional[torch.Tensor] = None discrete_action: Optional[List[torch.Tensor]] = None # This checks None because mypy complains otherwise if dists.continuous is not None: - continuous_action = dists.continuous.sample() + if self.deterministic: + continuous_action = dists.continuous.deterministic_sample() + else: + continuous_action = dists.continuous.sample() if dists.discrete is not None: discrete_action = [] - for discrete_dist in dists.discrete: - discrete_action.append(discrete_dist.sample()) + if self.deterministic: + for discrete_dist in dists.discrete: + discrete_action.append(discrete_dist.deterministic_sample()) + else: + for discrete_dist in dists.discrete: + discrete_action.append(discrete_dist.sample()) return AgentAction(continuous_action, discrete_action) def _get_dists(self, inputs: torch.Tensor, masks: torch.Tensor) -> DistInstances: diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 1f5960d10b..00aee3aaa0 
100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -16,6 +16,13 @@ def sample(self) -> torch.Tensor: """ pass + @abc.abstractmethod + def deterministic_sample(self) -> torch.Tensor: + """ + Return the most probable sample from this distribution. + """ + pass + @abc.abstractmethod def log_prob(self, value: torch.Tensor) -> torch.Tensor: """ @@ -59,6 +66,9 @@ def sample(self): sample = self.mean + torch.randn_like(self.mean) * self.std return sample + def deterministic_sample(self): + return self.mean + def log_prob(self, value): var = self.std ** 2 log_scale = torch.log(self.std + EPSILON) @@ -113,6 +123,9 @@ def __init__(self, logits): def sample(self): return torch.multinomial(self.probs, 1) + def deterministic_sample(self): + return torch.argmax(self.probs).reshape((1, 1)) + def pdf(self, value): # This function is equivalent to torch.diag(self.probs.T[value.flatten().long()]), # but torch.diag is not supported by ONNX export. From 3e1a60a2ab69edb6c53645e81146a1fb7086d5b4 Mon Sep 17 00:00:00 2001 From: Chingiz Mardanov Date: Wed, 27 Oct 2021 13:32:59 -0400 Subject: [PATCH 03/15] Add information to the changelog. --- com.unity.ml-agents/CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index dab7cb200f..c0422cc3a6 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -28,6 +28,9 @@ and this project adheres to 1. env_params.max_lifetime_restarts (--max-lifetime-restarts) [default=10] 2. env_params.restarts_rate_limit_n (--restarts-rate-limit-n) [default=1] 3. env_params.restarts_rate_limit_period_s (--restarts-rate-limit-period-s) [default=60] + +- Added a new `--deterministic` flag to make sure that actions are selected in a predictable deterministic manner. Same can +be achieved by setting a `deterministic: true` in the `network_settings` of the run options configuration. ### Bug Fixes - Fixed the bug where curriculum learning would crash because of the incorrect run_options parsing. (#5586) From 2918be6bea65d630ef2a002223e8a68f899de1b1 Mon Sep 17 00:00:00 2001 From: Chingiz Mardanov Date: Wed, 27 Oct 2021 17:17:35 -0400 Subject: [PATCH 04/15] Prioritize the CLI over the configuration file. --- ml-agents/mlagents/trainers/settings.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 24a46a7120..82a917900a 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -933,6 +933,7 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": # Override with CLI args # Keep deprecated --load working, TODO: remove argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"] + for key, val in argparse_args.items(): if key in DetectDefault.non_default_args: if key in attr.fields_dict(CheckpointSettings): @@ -953,8 +954,11 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": # configure whether or not we should require all behavior names to be found in the config YAML final_runoptions.behaviors.set_config_specified(_require_all_behaviors) - for behaviour in final_runoptions.behaviors.keys(): - if not final_runoptions.behaviors[behaviour].network_settings.deterministic: + _non_default_args = DetectDefault.non_default_args + + # Prioritize the deterministic mode form the cli for deterministic actions. 
+ if "deterministic" in _non_default_args: + for behaviour in final_runoptions.behaviors.keys(): final_runoptions.behaviors[ behaviour ].network_settings.deterministic = argparse_args["deterministic"] From f1d09652db04ba2ada73b119c2eec95c9e5a6124 Mon Sep 17 00:00:00 2001 From: Chingiz Mardanov Date: Thu, 28 Oct 2021 13:54:52 -0400 Subject: [PATCH 05/15] Update documentation for config file. --- docs/Training-Configuration-File.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/Training-Configuration-File.md b/docs/Training-Configuration-File.md index 537bea2f3e..669e050b3a 100644 --- a/docs/Training-Configuration-File.md +++ b/docs/Training-Configuration-File.md @@ -44,6 +44,7 @@ choice of the trainer (which we review on subsequent sections). | `network_settings -> normalize` | (default = `false`) Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. | | `network_settings -> vis_encode_type` | (default = `simple`) Encoder type for encoding visual observations.
<br><br>
`simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. `match3` is a smaller CNN ([Gudmundsoon et al.](https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning)) that can capture more granular spatial relationships and is optimized for board games. `fully_connected` uses a single fully connected dense layer as encoder without any convolutional layers.
<br><br>
Due to the size of convolution kernel, there is a minimum observation size limitation that each encoder type can handle - `simple`: 20x20, `nature_cnn`: 36x36, `resnet`: 15 x 15, `match3`: 5x5. `fully_connected` doesn't have convolutional layers and thus no size limits, but since it has less representation power it should be reserved for very small inputs. Note that using the `match3` CNN with very large visual input might result in a huge observation encoding and thus potentially slow down training or cause memory issues. | | `network_settings -> conditioning_type` | (default = `hyper`) Conditioning type for the policy using goal observations.
<br><br>
`none` treats the goal observations as regular observations, `hyper` (default) uses a HyperNetwork with goal observations as input to generate some of the weights of the policy. Note that when using `hyper` the number of parameters of the network increases greatly. Therefore, it is recommended to reduce the number of `hidden_units` when using this `conditioning_type` +| `network_settings -> deterministic` | (default = `false`) Ensures that actions are selected from the models output deterministically to ensure predictable and reproducible results. This can be overwritten by the `--deterministic` flag on the CLI. ## Trainer-specific Configurations From 646498e058e474238053c6dae5a660d12405e918 Mon Sep 17 00:00:00 2001 From: Chingiz Mardanov Date: Fri, 29 Oct 2021 13:10:56 -0400 Subject: [PATCH 06/15] CR refactor. --- ml-agents/mlagents/trainers/torch/action_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/action_model.py b/ml-agents/mlagents/trainers/torch/action_model.py index 8d18416f68..4928fa80a1 100644 --- a/ml-agents/mlagents/trainers/torch/action_model.py +++ b/ml-agents/mlagents/trainers/torch/action_model.py @@ -67,7 +67,7 @@ def __init__( # During training, clipping is done in TorchPolicy, but we need to clip before ONNX # export as well. self._clip_action_on_export = not tanh_squash - self.deterministic = deterministic + self._deterministic = deterministic def _sample_action(self, dists: DistInstances) -> AgentAction: """ @@ -80,13 +80,13 @@ def _sample_action(self, dists: DistInstances) -> AgentAction: discrete_action: Optional[List[torch.Tensor]] = None # This checks None because mypy complains otherwise if dists.continuous is not None: - if self.deterministic: + if self._deterministic: continuous_action = dists.continuous.deterministic_sample() else: continuous_action = dists.continuous.sample() if dists.discrete is not None: discrete_action = [] - if self.deterministic: + if self._deterministic: for discrete_dist in dists.discrete: discrete_action.append(discrete_dist.deterministic_sample()) else: From 78fd1c80a6e3d28835dae9bdfeb4c7e018966ae5 Mon Sep 17 00:00:00 2001 From: cmard <87716492+cmard@users.noreply.github.com> Date: Fri, 29 Oct 2021 13:12:09 -0400 Subject: [PATCH 07/15] Update docs/Training-Configuration-File.md Co-authored-by: Miguel Alonso Jr. <76960110+miguelalonsojr@users.noreply.github.com> --- docs/Training-Configuration-File.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Training-Configuration-File.md b/docs/Training-Configuration-File.md index 669e050b3a..a3e6cc35fb 100644 --- a/docs/Training-Configuration-File.md +++ b/docs/Training-Configuration-File.md @@ -44,7 +44,7 @@ choice of the trainer (which we review on subsequent sections). | `network_settings -> normalize` | (default = `false`) Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. | | `network_settings -> vis_encode_type` | (default = `simple`) Encoder type for encoding visual observations.
<br><br>
`simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. `match3` is a smaller CNN ([Gudmundsoon et al.](https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning)) that can capture more granular spatial relationships and is optimized for board games. `fully_connected` uses a single fully connected dense layer as encoder without any convolutional layers.
<br><br>
Due to the size of convolution kernel, there is a minimum observation size limitation that each encoder type can handle - `simple`: 20x20, `nature_cnn`: 36x36, `resnet`: 15 x 15, `match3`: 5x5. `fully_connected` doesn't have convolutional layers and thus no size limits, but since it has less representation power it should be reserved for very small inputs. Note that using the `match3` CNN with very large visual input might result in a huge observation encoding and thus potentially slow down training or cause memory issues. | | `network_settings -> conditioning_type` | (default = `hyper`) Conditioning type for the policy using goal observations.
<br><br>
`none` treats the goal observations as regular observations, `hyper` (default) uses a HyperNetwork with goal observations as input to generate some of the weights of the policy. Note that when using `hyper` the number of parameters of the network increases greatly. Therefore, it is recommended to reduce the number of `hidden_units` when using this `conditioning_type` -| `network_settings -> deterministic` | (default = `false`) Ensures that actions are selected from the models output deterministically to ensure predictable and reproducible results. This can be overwritten by the `--deterministic` flag on the CLI. +| `network_settings -> deterministic` | (default = `false`) When set to true, ensures that actions are selected from the models output deterministically to ensure predictable and reproducible results. This can be overwritten by the `--deterministic` flag on the CLI. ## Trainer-specific Configurations From 6e434513fb5e532a2f3ce8fd3011219b8a01eec0 Mon Sep 17 00:00:00 2001 From: cmard <87716492+cmard@users.noreply.github.com> Date: Fri, 29 Oct 2021 13:12:16 -0400 Subject: [PATCH 08/15] Update com.unity.ml-agents/CHANGELOG.md Co-authored-by: Miguel Alonso Jr. <76960110+miguelalonsojr@users.noreply.github.com> --- com.unity.ml-agents/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index c0422cc3a6..ea17c0e106 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -30,7 +30,7 @@ and this project adheres to 3. env_params.restarts_rate_limit_period_s (--restarts-rate-limit-period-s) [default=60] - Added a new `--deterministic` flag to make sure that actions are selected in a predictable deterministic manner. Same can -be achieved by setting a `deterministic: true` in the `network_settings` of the run options configuration. +be achieved by adding `deterministic: true` under `network_settings` of the run options configuration. ### Bug Fixes - Fixed the bug where curriculum learning would crash because of the incorrect run_options parsing. (#5586) From 4b6808f3d1a7404eda8a33a1ff9ee900ac5ef801 Mon Sep 17 00:00:00 2001 From: cmard <87716492+cmard@users.noreply.github.com> Date: Fri, 29 Oct 2021 13:12:22 -0400 Subject: [PATCH 09/15] Update com.unity.ml-agents/CHANGELOG.md Co-authored-by: Miguel Alonso Jr. <76960110+miguelalonsojr@users.noreply.github.com> --- com.unity.ml-agents/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index ea17c0e106..cc9c81c7b3 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -29,7 +29,7 @@ and this project adheres to 2. env_params.restarts_rate_limit_n (--restarts-rate-limit-n) [default=1] 3. env_params.restarts_rate_limit_period_s (--restarts-rate-limit-period-s) [default=60] -- Added a new `--deterministic` flag to make sure that actions are selected in a predictable deterministic manner. Same can +- Added a new `--deterministic` flag to make sure that actions are selected in a predictable deterministic manner. The same thing can be achieved by adding `deterministic: true` under `network_settings` of the run options configuration. ### Bug Fixes - Fixed the bug where curriculum learning would crash because of the incorrect run_options parsing. 
(#5586) From a507c7d1c357143fd71c5392da4753ca26c1768f Mon Sep 17 00:00:00 2001 From: cmard <87716492+cmard@users.noreply.github.com> Date: Mon, 1 Nov 2021 09:33:59 -0400 Subject: [PATCH 10/15] Update com.unity.ml-agents/CHANGELOG.md Co-authored-by: Maryam Honari --- com.unity.ml-agents/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index cc9c81c7b3..e5df21216b 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -29,7 +29,7 @@ and this project adheres to 2. env_params.restarts_rate_limit_n (--restarts-rate-limit-n) [default=1] 3. env_params.restarts_rate_limit_period_s (--restarts-rate-limit-period-s) [default=60] -- Added a new `--deterministic` flag to make sure that actions are selected in a predictable deterministic manner. The same thing can +- Added a new `--deterministic` cli flag to deterministically select the most probable actions in policy. The same thing can be achieved by adding `deterministic: true` under `network_settings` of the run options configuration. ### Bug Fixes - Fixed the bug where curriculum learning would crash because of the incorrect run_options parsing. (#5586) From 5b059fd6bc343bc9797f50125a924f3751d7aa19 Mon Sep 17 00:00:00 2001 From: cmard <87716492+cmard@users.noreply.github.com> Date: Mon, 1 Nov 2021 09:34:05 -0400 Subject: [PATCH 11/15] Update ml-agents/mlagents/trainers/settings.py Co-authored-by: Maryam Honari --- ml-agents/mlagents/trainers/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 82a917900a..f0b859fa33 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -956,7 +956,7 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": _non_default_args = DetectDefault.non_default_args - # Prioritize the deterministic mode form the cli for deterministic actions. + # Prioritize the deterministic mode from the cli for deterministic actions. if "deterministic" in _non_default_args: for behaviour in final_runoptions.behaviors.keys(): final_runoptions.behaviors[ From c8eb7a92d4b6cf864773692e982438083db56ea7 Mon Sep 17 00:00:00 2001 From: cmard <87716492+cmard@users.noreply.github.com> Date: Mon, 1 Nov 2021 09:34:12 -0400 Subject: [PATCH 12/15] Update ml-agents/mlagents/trainers/cli_utils.py Co-authored-by: Maryam Honari --- ml-agents/mlagents/trainers/cli_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index ca7fd1e02e..3aa2f8b8f8 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -96,7 +96,7 @@ def _create_parser() -> argparse.ArgumentParser: default=False, dest="deterministic", action=DetectDefaultStoreTrue, - help="Whether to use the deterministic samples from the data.", + help="Whether to select actions deterministically in policy. 
`dist.mean` for continuous action space, and `dist.argmax` for discrete action space ", ) argparser.add_argument( "--force", From 283ed15ea07118af11ed74b30a9a9696bd8eae9d Mon Sep 17 00:00:00 2001 From: Chingiz Mardanov Date: Mon, 1 Nov 2021 10:32:21 -0400 Subject: [PATCH 13/15] Fix CR requests --- ml-agents/mlagents/trainers/tests/test_settings.py | 8 ++++++-- ml-agents/mlagents/trainers/torch/action_model.py | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_settings.py b/ml-agents/mlagents/trainers/tests/test_settings.py index e53d0d566d..9d14fc5cc7 100644 --- a/ml-agents/mlagents/trainers/tests/test_settings.py +++ b/ml-agents/mlagents/trainers/tests/test_settings.py @@ -529,7 +529,10 @@ def test_environment_settings(): def test_default_settings(): # Make default settings, one nested and one not. - default_settings = {"max_steps": 1, "network_settings": {"num_layers": 1000}} + default_settings = { + "max_steps": 1, + "network_settings": {"num_layers": 1000, "deterministic": True}, + } behaviors = {"test1": {"max_steps": 2, "network_settings": {"hidden_units": 2000}}} run_options_dict = {"default_settings": default_settings, "behaviors": behaviors} run_options = RunOptions.from_dict(run_options_dict) @@ -542,8 +545,9 @@ def test_default_settings(): test1_settings = run_options.behaviors["test1"] assert test1_settings.max_steps == 2 assert test1_settings.network_settings.hidden_units == 2000 - assert not test1_settings.network_settings.deterministic + assert test1_settings.network_settings.deterministic is True assert test1_settings.network_settings.num_layers == 1000 + # Change the overridden fields back, and check if the rest are equal. test1_settings.max_steps = 1 test1_settings.network_settings.hidden_units == default_settings_cls.network_settings.hidden_units diff --git a/ml-agents/mlagents/trainers/torch/action_model.py b/ml-agents/mlagents/trainers/torch/action_model.py index 4928fa80a1..8730e04255 100644 --- a/ml-agents/mlagents/trainers/torch/action_model.py +++ b/ml-agents/mlagents/trainers/torch/action_model.py @@ -44,6 +44,7 @@ def __init__( :params action_spec: The ActionSpec defining the action space dimensions and distributions. :params conditional_sigma: Whether or not the std of a Gaussian is conditioned on state. :params tanh_squash: Whether to squash the output of a Gaussian with the tanh function. + :params deterministic: Whether to select actions deterministically in policy. """ super().__init__() self.encoding_size = hidden_size @@ -79,6 +80,7 @@ def _sample_action(self, dists: DistInstances) -> AgentAction: continuous_action: Optional[torch.Tensor] = None discrete_action: Optional[List[torch.Tensor]] = None # This checks None because mypy complains otherwise + print(self._deterministic) if dists.continuous is not None: if self._deterministic: continuous_action = dists.continuous.deterministic_sample() else: continuous_action = dists.continuous.sample() if dists.discrete is not None: discrete_action = [] if self._deterministic: for discrete_dist in dists.discrete: discrete_action.append(discrete_dist.deterministic_sample()) else: for discrete_dist in dists.discrete: discrete_action.append(discrete_dist.sample()) From 04310256709a83e4d6a059e6895dbb851831f4a0 Mon Sep 17 00:00:00 2001 From: Chingiz Mardanov Date: Wed, 3 Nov 2021 13:53:04 -0400 Subject: [PATCH 14/15] Add tests for discrete.
--- ml-agents/mlagents/trainers/tests/torch/test_action_model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_action_model.py b/ml-agents/mlagents/trainers/tests/torch/test_action_model.py index 1365c5fdba..1dc8b21a54 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_action_model.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_action_model.py @@ -54,6 +54,9 @@ def test_deterministic_sample_action(): agent_action3 = action_model._sample_action(dists) assert torch.equal(agent_action1.continuous_tensor, agent_action2.continuous_tensor) assert torch.equal(agent_action1.continuous_tensor, agent_action3.continuous_tensor) + assert torch.equal(agent_action1.discrete_tensor, agent_action2.discrete_tensor) + assert torch.equal(agent_action1.discrete_tensor, agent_action3.discrete_tensor) + action_model, masks = create_action_model(inp_size, act_size, deterministic=False) sample_inp = torch.ones((1, inp_size)) dists = action_model._get_dists(sample_inp, masks=masks) @@ -66,6 +69,8 @@ def test_deterministic_sample_action(): assert not torch.equal( agent_action1.continuous_tensor, agent_action3.continuous_tensor ) + assert not torch.equal(agent_action1.discrete_tensor, agent_action2.discrete_tensor) + assert not torch.equal(agent_action1.discrete_tensor, agent_action3.discrete_tensor) def test_get_probs_and_entropy(): From 98da4b1717f565223094e83291e1e29e58281354 Mon Sep 17 00:00:00 2001 From: cmard <87716492+cmard@users.noreply.github.com> Date: Wed, 3 Nov 2021 14:09:45 -0400 Subject: [PATCH 15/15] Update ml-agents/mlagents/trainers/torch/distributions.py Co-authored-by: Maryam Honari --- ml-agents/mlagents/trainers/torch/distributions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 00aee3aaa0..c60426c998 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -124,7 +124,7 @@ def sample(self): return torch.multinomial(self.probs, 1) def deterministic_sample(self): - return torch.argmax(self.probs).reshape((1, 1)) + return torch.argmax(self.probs, dim=1, keepdim=True) def pdf(self, value): # This function is equivalent to torch.diag(self.probs.T[value.flatten().long()]),
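
For reviewers skimming the series, the sketch below (not part of any patch above; the tensor shapes and variable names are illustrative only) summarizes the sampling semantics the PR introduces: `deterministic_sample()` returns the Gaussian mean for continuous actions and the per-branch argmax for discrete actions. The `keepdim=True` change in PATCH 15 keeps the `(batch, 1)` shape that `torch.multinomial` produces, whereas the earlier `reshape((1, 1))` only handled a batch of one.

```python
import torch

batch_size, act_size, num_choices = 4, 3, 5

# Continuous branch: a diagonal Gaussian parameterized by mean and std.
mean = torch.zeros(batch_size, act_size)
std = torch.ones(batch_size, act_size)
stochastic_continuous = mean + torch.randn_like(mean) * std  # dists.continuous.sample()
deterministic_continuous = mean                              # dists.continuous.deterministic_sample()

# Discrete branch: a categorical distribution over one action branch.
logits = torch.randn(batch_size, num_choices)
probs = torch.softmax(logits, dim=1)
stochastic_discrete = torch.multinomial(probs, 1)  # discrete_dist.sample(), shape (batch_size, 1)
# keepdim=True (PATCH 15) preserves the (batch_size, 1) shape for any batch size.
deterministic_discrete = torch.argmax(probs, dim=1, keepdim=True)  # discrete_dist.deterministic_sample()

# The deterministic path is shape-compatible with the stochastic one and repeatable
# for a fixed distribution, which is what the test in PATCH 02 asserts.
assert deterministic_discrete.shape == stochastic_discrete.shape == (batch_size, 1)
assert torch.equal(deterministic_discrete, torch.argmax(probs, dim=1, keepdim=True))
```

When both the YAML `network_settings -> deterministic` key and the `--deterministic` CLI flag are supplied, PATCH 04 makes the CLI flag take precedence: the per-behavior setting is overridden only when `"deterministic"` appears among the non-default CLI arguments.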