1717
1818import chainer
1919from chainer import functions as F
20+ from chainer import links as L
2021import gym
22+ import gym .spaces
2123import gym .wrappers
2224import numpy as np
2325
2426import chainerrl
25- from chainerrl .agents import a3c
2627from chainerrl .agents import PPO
2728from chainerrl import experiments
28- from chainerrl import links
2929from chainerrl import misc
3030from chainerrl .optimizers .nonbias_weight_decay import NonbiasWeightDecay
31- from chainerrl import policies
32-
33-
class A3CFFSoftmax(chainer.ChainList, a3c.A3CModel):
    """A3C feedforward model with a softmax policy head.

    Pairs a softmax policy ``pi`` with a state-value head ``v``, each an
    MLP over the flat observation vector, and registers both with
    ``ChainList`` so their parameters are visible to the optimizer.
    """

    def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)):
        # Build both heads up front, then hand them to ChainList.
        pi_net = links.MLP(ndim_obs, n_actions, hidden_sizes)
        self.pi = policies.SoftmaxPolicy(model=pi_net)
        self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes)
        super().__init__(self.pi, self.v)

    def pi_and_v(self, state):
        """Return (action distribution, value estimate) for *state*."""
        action_dist = self.pi(state)
        value = self.v(state)
        return action_dist, value
46-
class A3CFFMellowmax(chainer.ChainList, a3c.A3CModel):
    """A3C feedforward model with a mellowmax policy head.

    Same layout as the softmax variant: a mellowmax policy ``pi`` and a
    state-value head ``v``, both MLPs over the observation vector,
    registered with ``ChainList`` for parameter tracking.
    """

    def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)):
        # Construct the policy and value networks, then register them.
        pi_net = links.MLP(ndim_obs, n_actions, hidden_sizes)
        self.pi = policies.MellowmaxPolicy(model=pi_net)
        self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes)
        super().__init__(self.pi, self.v)

    def pi_and_v(self, state):
        """Return (action distribution, value estimate) for *state*."""
        action_dist = self.pi(state)
        value = self.v(state)
        return action_dist, value
59-
class A3CFFGaussian(chainer.Chain, a3c.A3CModel):
    """An example of A3C feedforward Gaussian policy.

    Combines a Gaussian policy with state-independent diagonal
    covariance (``pi``) and an MLP state-value head (``v``), both
    registered inside ``init_scope`` so Chainer tracks their parameters.

    Args:
        obs_size (int): Dimensionality of observation vectors.
        action_space: Continuous (Box-like) action space; ``low`` and
            ``high`` are used to size and optionally bound the mean.
        n_hidden_layers (int): Number of hidden layers in both heads.
        n_hidden_channels (int): Units per hidden layer.
        bound_mean (bool): Whether to bound the policy mean to the
            action range. Must be passed explicitly as True or False.

    Raises:
        ValueError: If ``bound_mean`` is not explicitly True or False.
    """

    def __init__(self, obs_size, action_space,
                 n_hidden_layers=2, n_hidden_channels=64,
                 bound_mean=None):
        # Validate with a real exception: `assert` would be silently
        # stripped under `python -O`, letting a missing bound_mean slip
        # through to the policy constructor.
        if bound_mean not in (False, True):
            raise ValueError(
                'bound_mean must be explicitly True or False, got %r'
                % (bound_mean,))
        super().__init__()
        hidden_sizes = (n_hidden_channels,) * n_hidden_layers
        with self.init_scope():
            self.pi = policies.FCGaussianPolicyWithStateIndependentCovariance(
                obs_size, action_space.low.size,
                n_hidden_layers, n_hidden_channels,
                var_type='diagonal', nonlinearity=F.tanh,
                bound_mean=bound_mean,
                min_action=action_space.low, max_action=action_space.high,
                mean_wscale=1e-2)
            self.v = links.MLP(obs_size, 1, hidden_sizes=hidden_sizes)

    def pi_and_v(self, state):
        """Return (action distribution, value estimate) for *state*."""
        return self.pi(state), self.v(state)
8131
8232
8333def main ():
@@ -87,10 +37,6 @@ def main():
8737 parser .add_argument ('--gpu' , type = int , default = 0 )
8838 parser .add_argument ('--env' , type = str , default = 'Hopper-v2' )
8939 parser .add_argument ('--num-envs' , type = int , default = 1 )
90- parser .add_argument ('--arch' , type = str , default = 'FFGaussian' ,
91- choices = ('FFSoftmax' , 'FFMellowmax' ,
92- 'FFGaussian' ))
93- parser .add_argument ('--bound-mean' , action = 'store_true' )
9440 parser .add_argument ('--seed' , type = int , default = 0 ,
9541 help = 'Random seed [0, 2 ** 32)' )
9642 parser .add_argument ('--outdir' , type = str , default = 'results' ,
@@ -164,14 +110,49 @@ def make_batch_env(test):
164110 obs_normalizer = chainerrl .links .EmpiricalNormalization (
165111 obs_space .low .size , clip_threshold = 5 )
166112
113+ winit_last = chainer .initializers .LeCunNormal (1e-2 )
114+
167115 # Switch policy types accordingly to action space types
168- if args .arch == 'FFSoftmax' :
169- model = A3CFFSoftmax (obs_space .low .size , action_space .n )
170- elif args .arch == 'FFMellowmax' :
171- model = A3CFFMellowmax (obs_space .low .size , action_space .n )
172- elif args .arch == 'FFGaussian' :
173- model = A3CFFGaussian (obs_space .low .size , action_space ,
174- bound_mean = args .bound_mean )
116+ if isinstance (action_space , gym .spaces .Discrete ):
117+ n_actions = action_space .n
118+ policy = chainer .Sequential (
119+ L .Linear (None , 64 ),
120+ F .tanh ,
121+ L .Linear (None , 64 ),
122+ F .tanh ,
123+ L .Linear (None , n_actions , initialW = winit_last ),
124+ chainerrl .distribution .SoftmaxDistribution ,
125+ )
126+ elif isinstance (action_space , gym .spaces .Box ):
127+ action_size = action_space .low .size
128+ policy = chainer .Sequential (
129+ L .Linear (None , 64 ),
130+ F .tanh ,
131+ L .Linear (None , 64 ),
132+ F .tanh ,
133+ L .Linear (None , action_size , initialW = winit_last ),
134+ chainerrl .policies .GaussianHeadWithStateIndependentCovariance (
135+ action_size = action_size ,
136+ var_type = 'diagonal' ,
137+ var_func = lambda x : F .exp (2 * x ), # Parameterize log std
138+ var_param_init = 0 , # log std = 0 => std = 1
139+ ),
140+ )
141+ else :
142+ print ("""\
143+ This example only supports gym.spaces.Box or gym.spaces.Discrete action spaces.""" ) # NOQA
144+ return
145+
146+ vf = chainer .Sequential (
147+ L .Linear (None , 64 ),
148+ F .tanh ,
149+ L .Linear (None , 64 ),
150+ F .tanh ,
151+ L .Linear (None , 1 ),
152+ )
153+
154+ # Combine a policy and a value function into a single model
155+ model = chainerrl .links .Branched (policy , vf )
175156
176157 opt = chainer .optimizers .Adam (alpha = args .lr , eps = 1e-5 )
177158 opt .setup (model )
@@ -208,13 +189,6 @@ def lr_setter(env, agent, value):
208189 lr_decay_hook = experiments .LinearInterpolationHook (
209190 args .steps , args .lr , 0 , lr_setter )
210191
211- # Linearly decay the clipping parameter to zero
212- def clip_eps_setter (env , agent , value ):
213- agent .clip_eps = value
214-
215- clip_eps_decay_hook = experiments .LinearInterpolationHook (
216- args .steps , 0.2 , 0 , clip_eps_setter )
217-
218192 experiments .train_agent_batch_with_evaluation (
219193 agent = agent ,
220194 env = make_batch_env (False ),
@@ -230,7 +204,6 @@ def clip_eps_setter(env, agent, value):
230204 save_best_so_far_agent = False ,
231205 step_hooks = [
232206 lr_decay_hook ,
233- clip_eps_decay_hook ,
234207 ],
235208 )
236209
0 commit comments