[🐛🔨 ] Fix SAC target for continuous actions (#5372)

vincentpierre · web-flow · commit 1c4ceaffbafd · 2021-05-18T22:20:28.000-07:00
* Fix of the target entropy for continuous SAC

* Lower the required steps of the test and remove the unnecessary unsqueeze

* Changing the target from -dim(a)^2 to -dim(a) by removing implicit broadcasting
diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py
@@ -385,8 +385,8 @@ def sac_policy_loss(
             all_mean_q1 = mean_q1
         if self._action_spec.continuous_size > 0:
             cont_log_probs = log_probs.continuous_tensor
-            batch_policy_loss += torch.mean(
-                _cont_ent_coef * cont_log_probs - all_mean_q1.unsqueeze(1), dim=1
+            batch_policy_loss += (
+                _cont_ent_coef * torch.sum(cont_log_probs, dim=1) - all_mean_q1
             )
         policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
 
@@ -426,8 +426,8 @@ def sac_entropy_loss(
         if self._action_spec.continuous_size > 0:
             with torch.no_grad():
                 cont_log_probs = log_probs.continuous_tensor
-                target_current_diff = torch.sum(
-                    cont_log_probs + self.target_entropy.continuous, dim=1
+                target_current_diff = (
+                    torch.sum(cont_log_probs, dim=1) + self.target_entropy.continuous
                 )
             # We update all the _cont_ent_coef as one block
             entropy_loss += -1 * ModelUtils.masked_mean(
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py
@@ -256,7 +256,7 @@ def test_2d_sac(action_sizes):
         SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
     )
     config = attr.evolve(
-        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=6000
+        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=3000
     )
     check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
 

Original file line number	Diff line number	Diff line change
`@@ -256,7 +256,7 @@ def test_2d_sac(action_sizes):`
`256`	`256`	`SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000`
`257`	`257`	`)`
`258`	`258`	`config = attr.evolve(`
`259`		`- SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=6000`
	`259`	`+ SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=3000`
`260`	`260`	`)`
`261`	`261`	`check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)`
`262`	`262`