@@ -235,6 +235,11 @@ class IQN(dqn.DQN):
235235 to sample from the return distribution at the next state.
236236 quantile_thresholds_K (int): Number of quantile thresholds used to
237237 compute greedy actions.
238+ act_deterministically (bool): IQN's action selection is by default
239+ stochastic as it samples quantile thresholds every time it acts,
240+ even for evaluation. If this option is set to True, it uses
241+ equally spaced quantile thresholds instead of randomly sampled ones
242+ for evaluation, making its action selection deterministic.
238243
239244 For other arguments, see chainerrl.agents.DQN.
240245 """
@@ -246,6 +251,7 @@ def __init__(self, *args, **kwargs):
246251         self.quantile_thresholds_N_prime = kwargs.pop(
247252             'quantile_thresholds_N_prime', 64)
248253         self.quantile_thresholds_K = kwargs.pop('quantile_thresholds_K', 32)
254+         self.act_deterministically = kwargs.pop('act_deterministically', False)
249255         super().__init__(*args, **kwargs)
250256
251257 def _compute_target_values (self , exp_batch ):
@@ -357,7 +363,16 @@ def _evaluate_model_and_update_recurrent_states(self, batch_obs, test):
357363                 batch_xs, self.train_recurrent_states)
358364         else:
359365             tau2av = self.model(batch_xs)
360-         taus_tilde = self.xp.random.uniform(
361-             0, 1,
362-             size=(len(batch_obs), self.quantile_thresholds_K)).astype('f')
366+         if test and self.act_deterministically:
367+             # Instead of uniform sampling, use a deterministic sequence of
368+             # equally spaced numbers from 0 to 1 as quantile thresholds.
369+             taus_tilde = self.xp.broadcast_to(
370+                 self.xp.linspace(
371+                     0, 1, num=self.quantile_thresholds_K, dtype=np.float32),
372+                 (len(batch_obs), self.quantile_thresholds_K),
373+             )
374+         else:
375+             taus_tilde = self.xp.random.uniform(
376+                 0, 1,
377+                 size=(len(batch_obs), self.quantile_thresholds_K)).astype('f')
363378         return tau2av(taus_tilde)
0 commit comments