Commit 8c18695

Behavior Cloning
1 parent 344fff2 commit 8c18695

6 files changed: +87 -160 lines changed

python/ray/rllib/bc/bc_evaluator.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@

 import ray
 from ray.rllib.bc.experience_dataset import ExperienceDataset
-from ray.rllib.bc.policy import Policy
+from ray.rllib.bc.policy import BCPolicy
 from ray.rllib.models import ModelCatalog
 from ray.rllib.optimizers import Evaluator

@@ -17,7 +17,7 @@ def __init__(self, registry, env_creator, config, logdir):
         env = ModelCatalog.get_preprocessor_as_wrapper(registry, env_creator(config["env_config"]), config["model"])
         self.dataset = ExperienceDataset(config["dataset_path"])
         # TODO(rliaw): should change this to be just env.observation_space
-        self.policy = Policy(registry, env.observation_space.shape, env.action_space, config)
+        self.policy = BCPolicy(registry, env.observation_space.shape, env.action_space, config)
         self.config = config
         self.logdir = logdir
         self.metrics_queue = queue.Queue()

python/ray/rllib/bc/experience_dataset.py

Lines changed: 10 additions & 0 deletions
@@ -10,6 +10,16 @@

 class ExperienceDataset(object):
     def __init__(self, dataset_path):
+        """Create dataset of experience to imitate.
+
+        Parameters
+        ----------
+        dataset_path:
+            Path of file containing the database as pickled list of trajectories,
+            each trajectory being a list of steps,
+            each step containing the observation and action as its first two elements.
+            The file must be available on each machine used by a BCEvaluator.
+        """
         self._dataset = list(itertools.chain.from_iterable(pickle.load(open(dataset_path, "rb"))))

     def sample(self, batch_size):
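
For context, a minimal sketch of a dataset file in the layout this docstring describes (not part of the diff; the path and values are purely illustrative). The updated rollout.py below writes the same structure when --out is given.

    import pickle

    # Two illustrative trajectories; each step lists the observation and the
    # action as its first two elements. Extra per-step fields (next_state,
    # reward, done) are fine, since only the first two are documented.
    trajectories = [
        [[[0.1, 0.2], 0], [[0.3, 0.4], 1]],   # trajectory 1: two steps
        [[[0.5, 0.6], 1]],                    # trajectory 2: one step
    ]
    with open("/tmp/bc_demo.pkl", "wb") as f:
        pickle.dump(trajectories, f)

    # ExperienceDataset("/tmp/bc_demo.pkl") flattens this into a single list
    # of three steps via itertools.chain.from_iterable.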

python/ray/rllib/bc/policy.py

Lines changed: 54 additions & 2 deletions
@@ -2,12 +2,29 @@
 from __future__ import division
 from __future__ import print_function

+import ray
 import tensorflow as tf
-from ray.rllib.bc.tfpolicy import TFPolicy
+from ray.rllib.a3c.policy import Policy
 from ray.rllib.models.catalog import ModelCatalog


-class Policy(TFPolicy):
+class BCPolicy(Policy):
+    def __init__(self, registry, ob_space, action_space, config, name="local", summarize=True):
+        super(BCPolicy, self).__init__(ob_space, action_space, name, summarize)
+        self.registry = registry
+        self.local_steps = 0
+        self.config = config
+        self.summarize = summarize
+        worker_device = "/job:localhost/replica:0/task:0/cpu:0"
+        self.g = tf.Graph()
+        with self.g.as_default(), tf.device(worker_device):
+            with tf.variable_scope(name):
+                self._setup_graph(ob_space, action_space)
+            print("Setting up loss")
+            self.setup_loss(action_space)
+            self.setup_gradients()
+            self.initialize()
+
     def _setup_graph(self, ob_space, ac_space):
         self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
         dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)

@@ -25,6 +42,29 @@ def setup_loss(self, action_space):
         self.pi_loss = - tf.reduce_sum(log_prob)
         self.loss = self.pi_loss

+    def setup_gradients(self):
+        grads = tf.gradients(self.loss, self.var_list)
+        self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
+        grads_and_vars = list(zip(self.grads, self.var_list))
+        opt = tf.train.AdamOptimizer(self.config["lr"])
+        self._apply_gradients = opt.apply_gradients(grads_and_vars)
+
+    def initialize(self):
+        if self.summarize:
+            bs = tf.to_float(tf.shape(self.x)[0])
+            tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
+            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads))
+            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
+            self.summary_op = tf.summary.merge_all()
+
+        # TODO(rliaw): Can consider exposing these parameters
+        self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
+            intra_op_parallelism_threads=1, inter_op_parallelism_threads=2,
+            gpu_options=tf.GPUOptions(allow_growth=True)))
+        self.variables = ray.experimental.TensorFlowVariables(self.loss,
+                                                              self.sess)
+        self.sess.run(tf.global_variables_initializer())
+
     def compute_gradients(self, samples):
         info = {}
         feed_dict = {

@@ -42,6 +82,18 @@ def compute_gradients(self, samples):
         info["loss"] = loss
         return grad, info

+    def apply_gradients(self, grads):
+        feed_dict = {self.grads[i]: grads[i]
+                     for i in range(len(grads))}
+        self.sess.run(self._apply_gradients, feed_dict=feed_dict)
+
+    def get_weights(self):
+        weights = self.variables.get_weights()
+        return weights
+
+    def set_weights(self, weights):
+        self.variables.set_weights(weights)
+
     def compute(self, ob, *args):
         action = self.sess.run(self.sample, {self.x: [ob]})
         return action, None
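
For context, a rough sketch (not part of the diff) of how an optimizer or evaluator might drive the methods added above; `policy` is a constructed BCPolicy, `dataset` an ExperienceDataset, and the config passed to BCPolicy is assumed to contain the "lr" and "grad_clip" keys read by setup_gradients().

    def local_bc_update(policy, dataset, batch_size):
        # One hypothetical behavior-cloning step: sample demonstration steps,
        # compute clipped gradients in the policy's own session, then apply
        # them with the Adam op built in setup_gradients().
        samples = dataset.sample(batch_size)
        grad, info = policy.compute_gradients(samples)
        policy.apply_gradients(grad)
        return info

    # Across processes, parameters travel as flat weights:
    #     weights = local_policy.get_weights()
    #     remote_policy.set_weights(weights)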

python/ray/rllib/bc/tfpolicy.py

Lines changed: 0 additions & 77 deletions
This file was deleted.

python/ray/rllib/eval.py

Lines changed: 0 additions & 68 deletions
This file was deleted.

python/ray/rllib/rollout.py

Lines changed: 21 additions & 11 deletions
@@ -35,10 +35,13 @@
     "tune registry.")
 required_named.add_argument(
     "--env", type=str, help="The gym environment to use.")
-required_named.add_argument(
-    "--steps", type=str, help="Number of steps to roll out.")
-required_named.add_argument(
-    "--out", type=str, help="Output filename.")
+parser.add_argument(
+    "--no-render", default=False, action="store_const", const=True,
+    help="Surpress rendering of the environment.")
+parser.add_argument(
+    "--steps", default=None, help="Number of steps to roll out.")
+parser.add_argument(
+    "--out", default=None, help="Output filename.")
 parser.add_argument(
     "--config", default="{}", type=json.loads,
     help="Algorithm-specific configuration (e.g. env, hyperparams), ")

@@ -59,16 +62,23 @@
     num_steps = int(args.steps)

     env = ModelCatalog.get_preprocessor_as_wrapper(get_registry(), gym.make(args.env))
-    rollouts = []
+    if args.out is not None:
+        rollouts = []
     steps = 0
-    while steps < num_steps:
-        rollout = []
+    while steps < (num_steps or steps + 1):
+        if args.out is not None:
+            rollout = []
         state = env.reset()
         done = False
-        while not done and steps < num_steps:
+        while not done and steps < (num_steps or steps + 1):
             action = agent.compute_action(state)
             next_state, reward, done, _ = env.step(action)
-            rollout.append([state, action, next_state, reward, done])
+            if not args.no_render:
+                env.render()
+            if args.out is not None:
+                rollout.append([state, action, next_state, reward, done])
             steps += 1
-        rollouts.append(rollout)
-    pickle.dump(rollouts, open(args.out, "wb"))
+        if args.out is not None:
+            rollouts.append(rollout)
+    if args.out is not None:
+        pickle.dump(rollouts, open(args.out, "wb"))
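
One non-obvious part of the new loop is the `steps < (num_steps or steps + 1)` guard, which makes --steps optional: when no step limit is set (num_steps falsy), the comparison reduces to `steps < steps + 1`, which is always true, so the inner loop only stops when the episode terminates; a positive num_steps still caps the total step count. A small standalone sketch of the inner-loop behavior (illustrative numbers only, not part of the diff):

    for num_steps in (None, 3):
        steps, done = 0, False
        while not done and steps < (num_steps or steps + 1):
            steps += 1
            done = steps >= 5              # pretend the episode ends after 5 steps
        print(num_steps, "->", steps)      # None -> 5 (full episode), 3 -> 3 (capped)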
