"""Contains `objectives` that are used to optimize actor-critic models."""
from abc import ABCMeta, abstractmethod
import numpy as np
import tensorflow as tf
class ActorCriticObjective(object, metaclass=ABCMeta):
"""An objective takes an :obj:`~actorcritic.model.ActorCriticModel` and determines how it is optimized. It defines
the loss of the policy and the loss of the baseline, and can create train operations based on these losses.
"""
@property
@abstractmethod
def policy_loss(self):
""":obj:`tf.Tensor`:
The current loss of the policy of the model.
"""
pass
@property
@abstractmethod
def baseline_loss(self):
""":obj:`tf.Tensor`:
The current loss of the baseline of the model.
"""
pass
    def optimize_separate(self, policy_optimizer, baseline_optimizer, policy_kwargs=None, baseline_kwargs=None):
"""Creates an operation that minimizes the policy loss and the baseline loss separately. This means that it
minimizes the losses using two different optimizers.
Args:
policy_optimizer (:obj:`tf.train.Optimizer`):
An optimizer that is used for the policy loss.
baseline_optimizer (:obj:`tf.train.Optimizer`):
An optimizer that is used for the baseline loss.
policy_kwargs (:obj:`dict`, optional):
Keyword arguments passed to the :meth:`minimize` method of the `policy_optimizer`.
baseline_kwargs (:obj:`dict`, optional):
Keyword arguments passed to the :meth:`minimize` method of the `baseline_optimizer`.
Returns:
:obj:`tf.Operation`:
An operation that updates both the policy and the baseline.
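        Example:
            A minimal usage sketch, assuming `objective` is an instance of a concrete subclass such as
            :obj:`A2CObjective` (the learning rates are illustrative only)::

                policy_optimizer = tf.train.RMSPropOptimizer(learning_rate=7e-4)
                baseline_optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3)
                train_op = objective.optimize_separate(policy_optimizer, baseline_optimizer)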
"""
        policy_kwargs = policy_kwargs or {}
        baseline_kwargs = baseline_kwargs or {}
        policy_op = policy_optimizer.minimize(self.policy_loss, **policy_kwargs)
        baseline_op = baseline_optimizer.minimize(self.baseline_loss, **baseline_kwargs)
        return tf.group(policy_op, baseline_op)
    def optimize_shared(self, optimizer, baseline_loss_weight=0.5, **kwargs):
"""Creates an operation that minimizes both the policy loss and the baseline loss using the same optimizer. This
is used for models that share parameters between the policy and the baseline. The shared loss is defined as::
shared_loss = policy_loss + baseline_loss_weight * baseline_loss
        where `baseline_loss_weight` scales the baseline loss relative to the policy loss, effectively acting as a
        relative learning rate for the baseline.
Args:
optimizer (:obj:`tf.train.Optimizer`):
An optimizer that is used for both the policy loss and the baseline loss.
            baseline_loss_weight (:obj:`float` or :obj:`tf.Tensor`, optional):
                Scales the baseline loss relative to the policy loss. Defaults to 0.5.
kwargs (:obj:`dict`, optional):
Keyword arguments passed to the :meth:`minimize` method of the optimizer.
Returns:
:obj:`tf.Operation`:
An operation that updates both the policy and the baseline.
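        Example:
            A minimal usage sketch, assuming `objective` is a concrete :obj:`ActorCriticObjective` whose model
            shares parameters between the policy and the baseline (the learning rate is illustrative only)::

                optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
                train_op = objective.optimize_shared(optimizer, baseline_loss_weight=0.5)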
"""
shared_loss = self.policy_loss + baseline_loss_weight * self.baseline_loss
return optimizer.minimize(shared_loss, **kwargs)
class A2CObjective(ActorCriticObjective):
"""An objective that defines the loss of the policy and the baseline according to the A3C and A2C/ACKTR papers.
The rewards are discounted and the policy loss uses entropy regularization. The baseline is optimized using a
squared error loss.
The policy objective uses entropy regularization::
J(theta) = log(policy(state, action | theta)) * (target_values - baseline) + beta * entropy(policy)
where `beta` determines the strength of the entropy regularization.
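    The target values are n-step returns bootstrapped with the value of the last observed state::

        target_values[t] = rewards[t] + gamma * rewards[t + 1] + ... + gamma^(T - 1 - t) * rewards[T - 1]
                           + gamma^(T - t) * bootstrap_values

    where `gamma` is the `discount_factor` and `T` is the number of steps in the rollout. The sum is truncated
    at the first terminal state, in which case the bootstrap term is dropped.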
See Also:
* https://arxiv.org/pdf/1602.01783.pdf (A3C)
* https://arxiv.org/pdf/1708.05144.pdf (A2C/ACKTR)
"""
    def __init__(self, model, discount_factor=0.99, entropy_regularization_strength=0.01, name=None):
"""
Args:
model (:obj:`~actorcritic.model.ActorCriticModel`):
A model that provides the policy and the baseline that will be optimized.
discount_factor (:obj:`float`):
                Used for discounting the rewards. Must be in the interval [0, 1].
entropy_regularization_strength (:obj:`float` or :obj:`tf.Tensor`):
                Determines the strength of the entropy regularization. Corresponds to the `beta` parameter in A3C.
name (:obj:`string`, optional):
A name for this objective.
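        Example:
            A minimal usage sketch, assuming `model` is an already constructed
            :obj:`~actorcritic.model.ActorCriticModel` (the hyperparameters are illustrative only)::

                objective = A2CObjective(model, discount_factor=0.99, entropy_regularization_strength=0.01)
                optimizer = tf.train.RMSPropOptimizer(learning_rate=7e-4)
                train_op = objective.optimize_shared(optimizer)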
"""
bootstrap_values = model.bootstrap_values
rewards = model.rewards_placeholder
terminals = model.terminals_placeholder
policy = model.policy
baseline = model.baseline
with tf.name_scope(name, 'A2CObjective'):
with tf.name_scope('target_values'):
discounted_rewards = _discount(rewards, terminals, discount_factor)
discounted_bootstrap_values = _discount_bootstrap(bootstrap_values, terminals, discount_factor)
target_values = tf.stop_gradient(discounted_rewards + discounted_bootstrap_values)
with tf.name_scope('advantage'):
# advantage = target_values - baseline
advantage = tf.stop_gradient(target_values - baseline.value)
with tf.name_scope('standard_policy_objective'):
# J(theta) = log(policy(state, action | theta)) * advantage
# TODO reduce_sum axis=1 ?
standard_policy_objective = tf.reduce_mean(advantage * policy.log_prob)
with tf.name_scope('entropy_regularization'):
with tf.name_scope('mean_entropy'):
self._mean_entropy = tf.reduce_mean(policy.entropy)
entropy_regularization = entropy_regularization_strength * self._mean_entropy
with tf.name_scope('policy_objective'):
# full policy objective with entropy regularization:
# J(theta) = log(policy(state, action | theta)) * advantage + beta * entropy(policy)
policy_objective = standard_policy_objective + entropy_regularization
with tf.name_scope('policy_loss'):
                # maximizing the policy objective = minimizing its negative
self._policy_loss = -policy_objective
with tf.name_scope('baseline_loss'):
# squared error loss for baseline (actually half squared error loss)
# TODO value_function_loss = -tf.reduce_mean(advantage_function * value_function_gradient) ?
self._baseline_loss = tf.reduce_mean(tf.square(target_values - baseline.value) / 2.)
@property
def policy_loss(self):
""":obj:`tf.Tensor`:
The current loss of the policy of the model.
"""
return self._policy_loss
@property
def baseline_loss(self):
""":obj:`tf.Tensor`:
The current loss of the baseline of the model.
"""
return self._baseline_loss
@property
def mean_entropy(self):
""":obj:`tf.Tensor`:
The current mean entropy of the policy of the model.
"""
return self._mean_entropy
def _discount(values, terminals, discount_factor):
    # computes the discounted cumulative sums of `values` along the time axis, where `terminals` marks the
    # ends of episodes so that values are not propagated backwards across episode boundaries
    def fn(terminals, discount_factor):
        batch_size, num_steps = terminals.shape

        # build a lower triangular matrix whose entry [k, t] equals discount_factor ** (k - t) for k >= t,
        # so that multiplying the values with it sums the discounted future values for each time step
        discount_factors = np.triu(np.ones((num_steps, num_steps), dtype=np.float32), k=1)
        discount_factors = np.cumsum(discount_factors, axis=1)
        discount_factors = np.transpose(discount_factors)
        discount_factors = discount_factor ** discount_factors
        discount_factors = np.tril(discount_factors, k=0)

        # one discount matrix per batch element
        discount_factors = np.expand_dims(discount_factors, axis=0)
        discount_factors = np.repeat(discount_factors, batch_size, axis=0)

        # zero out entries that would propagate values backwards across a terminal state
        indices = np.where(terminals)
        indices = np.transpose(indices)
        for batch_index, time_index in indices:
            discount_factors[batch_index, time_index + 1:, :time_index + 1] = 0.
        return discount_factors

    discount_matrices = tf.py_func(fn, [terminals, discount_factor], tf.float32, stateful=True)

    # discounted_values[b, t] = sum over k >= t of values[b, k] * discount_factor ** (k - t)
    values = tf.expand_dims(values, axis=1)
    discounted_values = tf.matmul(values, discount_matrices)
    return tf.squeeze(discounted_values, axis=1)
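
# A small illustration (hypothetical numbers, assuming TensorFlow 1.x graph mode) of what `_discount`
# computes: for rewards [[1., 1., 1.]], terminals [[False, True, False]] and discount_factor 0.9, the result
# evaluates to [[1.9, 1., 1.]], since the reward at t=2 does not flow back past the terminal at t=1.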
def _discount_bootstrap(values, terminals, discount_factor):
    # terminal trajectories do not need bootstrapping, so this returns the discounted bootstrap values
    # for all non-terminal sub-trajectories and zero wherever a terminal state follows
    def fn(terminals):
        # 1 where no terminal state occurs between a time step and the end of the rollout, 0 otherwise
        # (computed via a reversed cumulative product along the time axis)
        non_terminal = np.cumprod(np.flip(np.invert(terminals), axis=1), axis=1, dtype=np.int32)
        # discount_factor ** (number of steps remaining until the end of the rollout), zeroed behind terminals
        discount_factors = np.cumprod(non_terminal * discount_factor, axis=1, dtype=np.float32)
        return np.flip(discount_factors, axis=1)

    discount_factors = tf.py_func(fn, [terminals], tf.float32, stateful=True)
    return discount_factors * tf.expand_dims(values, axis=-1)  # element-wise multiplication with broadcasting
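
# A similar illustration (hypothetical numbers) for `_discount_bootstrap`: for bootstrap values [2.],
# terminals [[False, True, False]] and discount_factor 0.9, the result evaluates to [[0., 0., 1.8]], since
# only the final, non-terminal sub-trajectory is bootstrapped.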