# Source code for actorcritic.policies

"""Contains `policies` that determine the behavior of an `agent`."""


from abc import ABCMeta, abstractmethod

import tensorflow as tf


class Policy(object, metaclass=ABCMeta):
    """Abstract interface for stochastic policies.

    Concrete subclasses expose the tensors below, each computed from
    whatever inputs the subclass uses to parameterize its probabilities.
    """

    @property
    @abstractmethod
    def sample(self):
        """:obj:`tf.Tensor`: Actions drawn at random from this policy.

        The shape equals the shape of the inputs that parameterize the
        probabilities.
        """

    @property
    @abstractmethod
    def mode(self):
        """:obj:`tf.Tensor`: The most probable actions (the mode) under this
        policy.

        The shape equals the shape of the inputs that parameterize the
        probabilities.
        """

    @property
    @abstractmethod
    def entropy(self):
        """:obj:`tf.Tensor`: The entropy of this policy.

        The shape equals the shape of the inputs that parameterize the
        probabilities.
        """

    @property
    @abstractmethod
    def log_prob(self):
        """:obj:`tf.Tensor`: The log-probability of the provided actions
        under this policy.

        The shape equals the shape of the actions and of the inputs that
        parameterize the probabilities.
        """

    def register_predictive_distribution(self, layer_collection, random_seed=None):
        """Registers this policy's predictive distribution in the given
        :obj:`kfac.LayerCollection` (required for K-FAC).

        Args:
            layer_collection (:obj:`kfac.LayerCollection`): A layer collection
                used by the :obj:`~kfac.KfacOptimizer`.
            random_seed (:obj:`int`, optional): A random seed for sampling
                from the predictive distribution.

        Raises:
            :obj:`NotImplementedError`: If this policy does not support K-FAC.
        """
        # Subclasses that support K-FAC override this; the base class opts out.
        raise NotImplementedError()
class DistributionPolicy(Policy, metaclass=ABCMeta):
    """Base class for stochastic policies backed by a concrete
    :obj:`tf.distributions.Distribution`.

    All tensors required by :obj:`Policy` are derived from that distribution
    once, at construction time.
    """

    def __init__(self, distribution, actions, random_seed=None):
        """
        Args:
            distribution (:obj:`tf.distributions.Distribution`): The
                distribution this policy follows.
            actions (:obj:`tf.Tensor`): The input actions used to compute the
                log-probabilities. Must have the same shape as the inputs.
            random_seed (:obj:`int`, optional): A random seed used for
                sampling.
        """
        self._distribution = distribution

        # NOTE(review): sample/mode drop a trailing size-1 dimension —
        # presumably the distribution emits one; confirm against callers.
        drawn = distribution.sample(sample_shape=[], seed=random_seed, name='sample')
        self._sample = tf.squeeze(drawn, axis=-1)
        self._mode = tf.squeeze(distribution.mode(name='mode'), axis=-1)

        self._entropy = distribution.entropy(name='entropy')

        # stop_gradient: no gradients may flow back through the chosen
        # actions; they are cast to int32 before the lookup.
        frozen_actions = tf.stop_gradient(tf.cast(actions, tf.int32))
        self._log_prob = distribution.log_prob(frozen_actions, name='log_prob')

    @property
    def sample(self):
        """:obj:`tf.Tensor`: Actions drawn at random from this policy.

        The shape equals the shape of the inputs that parameterize the
        probabilities.
        """
        return self._sample

    @property
    def mode(self):
        """:obj:`tf.Tensor`: The most probable actions (the mode) under this
        policy.

        The shape equals the shape of the inputs that parameterize the
        probabilities.
        """
        return self._mode

    @property
    def entropy(self):
        """:obj:`tf.Tensor`: The entropy of this policy.

        The shape equals the shape of the inputs that parameterize the
        probabilities.
        """
        return self._entropy

    @property
    def log_prob(self):
        """:obj:`tf.Tensor`: The log-probability of the given actions under
        this policy.

        The shape equals the shape of the actions and of the inputs that
        parameterize the probabilities.
        """
        return self._log_prob
class SoftmaxPolicy(DistributionPolicy):
    """A stochastic policy that follows a categorical (softmax)
    distribution over discrete actions.
    """

    def __init__(self, logits, actions, random_seed=None, name=None):
        """
        Args:
            logits (:obj:`tf.Tensor`): The input logits (or 'scores') used to
                compute the probabilities.
            actions (:obj:`tf.Tensor`): The input actions used to compute the
                log-probabilities. Must have the same shape as `logits`.
            random_seed (:obj:`int`, optional): A random seed used for
                sampling.
            name (:obj:`string`, optional): A name for this policy.
        """
        with tf.name_scope(name, 'SoftmaxPolicy'):
            categorical = tf.distributions.Categorical(logits, name='distribution')
            super().__init__(categorical, actions, random_seed)

    def register_predictive_distribution(self, layer_collection, random_seed=None):
        """Registers this policy's predictive distribution in the given
        :obj:`kfac.LayerCollection` (required for K-FAC).

        Args:
            layer_collection (:obj:`kfac.LayerCollection`): A layer collection
                used by the :obj:`~kfac.KfacOptimizer`.
            random_seed (:obj:`int`, optional): A random seed for sampling
                from the predictive distribution.
        """
        # K-FAC fits the Fisher of the categorical output, parameterized by
        # the distribution's logits.
        return layer_collection.register_categorical_predictive_distribution(
            logits=self._distribution.logits, seed=random_seed)