1717
1818import chainer
1919from chainer import functions as F
20+ from chainer import links as L
2021import gym
22+ import gym .spaces
2123import gym .wrappers
2224import numpy as np
2325
2426import chainerrl
25- from chainerrl .agents import a3c
2627from chainerrl .agents import PPO
2728from chainerrl import experiments
28- from chainerrl import links
2929from chainerrl import misc
3030from chainerrl .optimizers .nonbias_weight_decay import NonbiasWeightDecay
31- from chainerrl import policies
32-
33-
class A3CFFSoftmax(chainer.ChainList, a3c.A3CModel):
    """A3C feedforward model with a softmax policy head.

    Pairs a softmax policy ``pi`` with a state-value head ``v``, each an
    MLP over the flat observation vector, and registers both with
    ``ChainList`` so their parameters are visible to the optimizer.
    """

    def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)):
        # Build both heads up front, then hand them to ChainList.
        pi_net = links.MLP(ndim_obs, n_actions, hidden_sizes)
        self.pi = policies.SoftmaxPolicy(model=pi_net)
        self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes)
        super().__init__(self.pi, self.v)

    def pi_and_v(self, state):
        """Return (action distribution, value estimate) for *state*."""
        action_dist = self.pi(state)
        value = self.v(state)
        return action_dist, value
46-
class A3CFFMellowmax(chainer.ChainList, a3c.A3CModel):
    """A3C feedforward model with a mellowmax policy head.

    Same layout as the softmax variant: a mellowmax policy ``pi`` and a
    state-value head ``v``, both MLPs over the observation vector,
    registered with ``ChainList`` for parameter tracking.
    """

    def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)):
        # Construct the policy and value networks, then register them.
        pi_net = links.MLP(ndim_obs, n_actions, hidden_sizes)
        self.pi = policies.MellowmaxPolicy(model=pi_net)
        self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes)
        super().__init__(self.pi, self.v)

    def pi_and_v(self, state):
        """Return (action distribution, value estimate) for *state*."""
        action_dist = self.pi(state)
        value = self.v(state)
        return action_dist, value
59-
class A3CFFGaussian(chainer.Chain, a3c.A3CModel):
    """An example of A3C feedforward Gaussian policy.

    Combines a Gaussian policy with state-independent diagonal
    covariance (``pi``) and an MLP state-value head (``v``), both
    registered inside ``init_scope`` so Chainer tracks their parameters.

    Args:
        obs_size (int): Dimensionality of observation vectors.
        action_space: Continuous (Box-like) action space; ``low`` and
            ``high`` are used to size and optionally bound the mean.
        n_hidden_layers (int): Number of hidden layers in both heads.
        n_hidden_channels (int): Units per hidden layer.
        bound_mean (bool): Whether to bound the policy mean to the
            action range. Must be passed explicitly as True or False.

    Raises:
        ValueError: If ``bound_mean`` is not explicitly True or False.
    """

    def __init__(self, obs_size, action_space,
                 n_hidden_layers=2, n_hidden_channels=64,
                 bound_mean=None):
        # Validate with a real exception: `assert` would be silently
        # stripped under `python -O`, letting a missing bound_mean slip
        # through to the policy constructor.
        if bound_mean not in (False, True):
            raise ValueError(
                'bound_mean must be explicitly True or False, got %r'
                % (bound_mean,))
        super().__init__()
        hidden_sizes = (n_hidden_channels,) * n_hidden_layers
        with self.init_scope():
            self.pi = policies.FCGaussianPolicyWithStateIndependentCovariance(
                obs_size, action_space.low.size,
                n_hidden_layers, n_hidden_channels,
                var_type='diagonal', nonlinearity=F.tanh,
                bound_mean=bound_mean,
                min_action=action_space.low, max_action=action_space.high,
                mean_wscale=1e-2)
            self.v = links.MLP(obs_size, 1, hidden_sizes=hidden_sizes)

    def pi_and_v(self, state):
        """Return (action distribution, value estimate) for *state*."""
        return self.pi(state), self.v(state)
8131
8232
8333def main ():
@@ -87,10 +37,6 @@ def main():
8737 parser .add_argument ('--gpu' , type = int , default = 0 )
8838 parser .add_argument ('--env' , type = str , default = 'Hopper-v2' )
8939 parser .add_argument ('--num-envs' , type = int , default = 1 )
90- parser .add_argument ('--arch' , type = str , default = 'FFGaussian' ,
91- choices = ('FFSoftmax' , 'FFMellowmax' ,
92- 'FFGaussian' ))
93- parser .add_argument ('--bound-mean' , action = 'store_true' )
9440 parser .add_argument ('--seed' , type = int , default = 0 ,
9541 help = 'Random seed [0, 2 ** 32)' )
9642 parser .add_argument ('--outdir' , type = str , default = 'results' ,
@@ -164,14 +110,49 @@ def make_batch_env(test):
164110 obs_normalizer = chainerrl .links .EmpiricalNormalization (
165111 obs_space .low .size , clip_threshold = 5 )
166112
113+ winit_last = chainer .initializers .LeCunNormal (1e-2 )
114+
167115 # Switch policy types accordingly to action space types
168- if args .arch == 'FFSoftmax' :
169- model = A3CFFSoftmax (obs_space .low .size , action_space .n )
170- elif args .arch == 'FFMellowmax' :
171- model = A3CFFMellowmax (obs_space .low .size , action_space .n )
172- elif args .arch == 'FFGaussian' :
173- model = A3CFFGaussian (obs_space .low .size , action_space ,
174- bound_mean = args .bound_mean )
116+ if isinstance (action_space , gym .spaces .Discrete ):
117+ n_actions = action_space .n
118+ policy = chainer .Sequential (
119+ L .Linear (None , 64 ),
120+ F .tanh ,
121+ L .Linear (None , 64 ),
122+ F .tanh ,
123+ L .Linear (None , n_actions , initialW = winit_last ),
124+ chainerrl .distribution .SoftmaxDistribution ,
125+ )
126+ elif isinstance (action_space , gym .spaces .Box ):
127+ action_size = action_space .low .size
128+ policy = chainer .Sequential (
129+ L .Linear (None , 64 ),
130+ F .tanh ,
131+ L .Linear (None , 64 ),
132+ F .tanh ,
133+ L .Linear (None , action_size , initialW = winit_last ),
134+ chainerrl .policies .GaussianHeadWithStateIndependentCovariance (
135+ action_size = action_size ,
136+ var_type = 'diagonal' ,
137+ var_func = lambda x : F .exp (2 * x ), # Parameterize log std
138+ var_param_init = 0 , # log std = 0 => std = 1
139+ ),
140+ )
141+ else :
142+ print ("""\
143+ This example only supports gym.spaces.Box or gym.spaces.Discrete action spaces.""" ) # NOQA
144+ return
145+
146+ vf = chainer .Sequential (
147+ L .Linear (None , 64 ),
148+ F .tanh ,
149+ L .Linear (None , 64 ),
150+ F .tanh ,
151+ L .Linear (None , 1 ),
152+ )
153+
154+ # Combine a policy and a value function into a single model
155+ model = chainerrl .links .Branched (policy , vf )
175156
176157 opt = chainer .optimizers .Adam (alpha = args .lr , eps = 1e-5 )
177158 opt .setup (model )
@@ -208,13 +189,6 @@ def lr_setter(env, agent, value):
208189 lr_decay_hook = experiments .LinearInterpolationHook (
209190 args .steps , args .lr , 0 , lr_setter )
210191
211- # Linearly decay the clipping parameter to zero
212- def clip_eps_setter (env , agent , value ):
213- agent .clip_eps = value
214-
215- clip_eps_decay_hook = experiments .LinearInterpolationHook (
216- args .steps , 0.2 , 0 , clip_eps_setter )
217-
218192 experiments .train_agent_batch_with_evaluation (
219193 agent = agent ,
220194 env = make_batch_env (False ),
@@ -230,7 +204,6 @@ def clip_eps_setter(env, agent, value):
230204 save_best_so_far_agent = False ,
231205 step_hooks = [
232206 lr_decay_hook ,
233- clip_eps_decay_hook ,
234207 ],
235208 )
236209
0 commit comments