Source code for actorcritic.objectives

"""Contains `objectives` that are used to optimize actor-critic models."""


from abc import ABCMeta, abstractmethod

import numpy as np
import tensorflow as tf


class ActorCriticObjective(object, metaclass=ABCMeta):
    """An objective takes an :obj:`~actorcritic.model.ActorCriticModel` and determines how it is optimized.

    It defines the loss of the policy and the loss of the baseline, and can create train operations based on these
    losses.
    """

    @property
    @abstractmethod
    def policy_loss(self):
        """:obj:`tf.Tensor`: The current loss of the policy of the model."""
        pass

    @property
    @abstractmethod
    def baseline_loss(self):
        """:obj:`tf.Tensor`: The current loss of the baseline of the model."""
        pass
    def optimize_separate(self, policy_optimizer, baseline_optimizer, policy_kwargs=None, baseline_kwargs=None):
        """Creates an operation that minimizes the policy loss and the baseline loss separately, i.e. using two
        different optimizers.

        Args:
            policy_optimizer (:obj:`tf.train.Optimizer`): An optimizer that is used for the policy loss.
            baseline_optimizer (:obj:`tf.train.Optimizer`): An optimizer that is used for the baseline loss.
            policy_kwargs (:obj:`dict`, optional): Keyword arguments passed to the :meth:`minimize` method of the
                `policy_optimizer`.
            baseline_kwargs (:obj:`dict`, optional): Keyword arguments passed to the :meth:`minimize` method of the
                `baseline_optimizer`.

        Returns:
            :obj:`tf.Operation`: An operation that updates both the policy and the baseline.
        """
        # default to empty keyword arguments, since unpacking `None` with ** would raise a TypeError
        if policy_kwargs is None:
            policy_kwargs = {}
        if baseline_kwargs is None:
            baseline_kwargs = {}

        policy_op = policy_optimizer.minimize(self.policy_loss, **policy_kwargs)
        baseline_op = baseline_optimizer.minimize(self.baseline_loss, **baseline_kwargs)
        return tf.group(policy_op, baseline_op)
    def optimize_shared(self, optimizer, baseline_loss_weight=0.5, **kwargs):
        """Creates an operation that minimizes both the policy loss and the baseline loss using the same optimizer.
        This is used for models that share parameters between the policy and the baseline.

        The shared loss is defined as::

            shared_loss = policy_loss + baseline_loss_weight * baseline_loss

        where `baseline_loss_weight` determines the 'learning rate' of the baseline relative to the policy loss.

        Args:
            optimizer (:obj:`tf.train.Optimizer`): An optimizer that is used for both the policy loss and the
                baseline loss.
            baseline_loss_weight (:obj:`float` or :obj:`tf.Tensor`): Determines the relative 'learning rate'.
            kwargs (:obj:`dict`, optional): Keyword arguments passed to the :meth:`minimize` method of the
                `optimizer`.

        Returns:
            :obj:`tf.Operation`: An operation that updates both the policy and the baseline.
        """
        shared_loss = self.policy_loss + baseline_loss_weight * self.baseline_loss
        return optimizer.minimize(shared_loss, **kwargs)
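
# Illustrative sketch (not part of the original module): how the two train-op helpers above might be used.
# The optimizer choice and the learning rates are assumptions for the example only.
def _example_train_ops(objective):
    """Builds train ops from an :class:`ActorCriticObjective`, once with separate optimizers and once shared."""
    # separate optimizers, e.g. a different learning rate for the baseline
    separate_op = objective.optimize_separate(
        policy_optimizer=tf.train.RMSPropOptimizer(learning_rate=7e-4),
        baseline_optimizer=tf.train.RMSPropOptimizer(learning_rate=1e-3))

    # one shared optimizer; `baseline_loss_weight` scales the baseline loss relative to the policy loss
    shared_op = objective.optimize_shared(
        tf.train.RMSPropOptimizer(learning_rate=7e-4), baseline_loss_weight=0.5)

    return separate_op, shared_op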
class A2CObjective(ActorCriticObjective):
    """An objective that defines the loss of the policy and the baseline according to the A3C and A2C/ACKTR papers.

    The rewards are discounted and the policy loss uses entropy regularization. The baseline is optimized using a
    squared error loss.

    The policy objective uses entropy regularization::

        J(theta) = log(policy(state, action | theta)) * (target_values - baseline) + beta * entropy(policy)

    where `beta` determines the strength of the entropy regularization.

    See Also:
        * https://arxiv.org/pdf/1602.01783.pdf (A3C)
        * https://arxiv.org/pdf/1708.05144.pdf (A2C/ACKTR)
    """
    def __init__(self, model, discount_factor=0.99, entropy_regularization_strength=0.01, name=None):
        """
        Args:
            model (:obj:`~actorcritic.model.ActorCriticModel`): A model that provides the policy and the baseline
                that will be optimized.
            discount_factor (:obj:`float`): Used for discounting the rewards. Should be in [0, 1].
            entropy_regularization_strength (:obj:`float` or :obj:`tf.Tensor`): Determines the strength of the
                entropy regularization. Corresponds to the `beta` parameter in A3C.
            name (:obj:`string`, optional): A name for this objective.
        """
        bootstrap_values = model.bootstrap_values
        rewards = model.rewards_placeholder
        terminals = model.terminals_placeholder
        policy = model.policy
        baseline = model.baseline

        with tf.name_scope(name, 'A2CObjective'):
            with tf.name_scope('target_values'):
                discounted_rewards = _discount(rewards, terminals, discount_factor)
                discounted_bootstrap_values = _discount_bootstrap(bootstrap_values, terminals, discount_factor)
                target_values = tf.stop_gradient(discounted_rewards + discounted_bootstrap_values)

            with tf.name_scope('advantage'):
                # advantage = target_values - baseline
                advantage = tf.stop_gradient(target_values - baseline.value)

            with tf.name_scope('standard_policy_objective'):
                # J(theta) = log(policy(state, action | theta)) * advantage
                # TODO reduce_sum axis=1 ?
                standard_policy_objective = tf.reduce_mean(advantage * policy.log_prob)

            with tf.name_scope('entropy_regularization'):
                with tf.name_scope('mean_entropy'):
                    self._mean_entropy = tf.reduce_mean(policy.entropy)

                entropy_regularization = entropy_regularization_strength * self._mean_entropy

            with tf.name_scope('policy_objective'):
                # full policy objective with entropy regularization:
                # J(theta) = log(policy(state, action | theta)) * advantage + beta * entropy(policy)
                policy_objective = standard_policy_objective + entropy_regularization

            with tf.name_scope('policy_loss'):
                # maximizing the policy objective = minimizing its negative
                self._policy_loss = -policy_objective

            with tf.name_scope('baseline_loss'):
                # squared error loss for the baseline (actually half the squared error)
                # TODO value_function_loss = -tf.reduce_mean(advantage_function * value_function_gradient) ?
                self._baseline_loss = tf.reduce_mean(tf.square(target_values - baseline.value) / 2.)
    @property
    def policy_loss(self):
        """:obj:`tf.Tensor`: The current loss of the policy of the model."""
        return self._policy_loss

    @property
    def baseline_loss(self):
        """:obj:`tf.Tensor`: The current loss of the baseline of the model."""
        return self._baseline_loss

    @property
    def mean_entropy(self):
        """:obj:`tf.Tensor`: The current mean entropy of the policy of the model."""
        return self._mean_entropy
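
# Illustrative sketch (not part of the original module): wiring an A2CObjective to a model. The
# `ActorCriticModel` instance and the hyperparameter values are assumptions for the example only.
def _example_a2c_objective(model):
    """Creates an :class:`A2CObjective` for `model` and returns a shared train op plus the mean entropy."""
    objective = A2CObjective(model, discount_factor=0.99, entropy_regularization_strength=0.01)

    # a single optimizer updates both the policy and the baseline through the shared loss
    train_op = objective.optimize_shared(
        tf.train.RMSPropOptimizer(learning_rate=7e-4), baseline_loss_weight=0.5)

    # `mean_entropy` can be logged to monitor how deterministic the policy has become
    return train_op, objective.mean_entropy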
def _discount(values, terminals, discount_factor):
    def fn(terminals, discount_factor):
        batch_size, num_steps = terminals.shape

        # build a lower-triangular matrix whose entry [i, j] equals discount_factor ** (i - j) for i >= j,
        # so that the matmul below computes the discounted sum of future values for every time step
        discount_factors = np.triu(np.ones((num_steps, num_steps), dtype=np.float32), k=1)
        discount_factors = np.cumsum(discount_factors, axis=1)
        discount_factors = np.transpose(discount_factors)
        discount_factors = discount_factor ** discount_factors
        discount_factors = np.tril(discount_factors, k=0)

        # one matrix per batch element
        discount_factors = np.expand_dims(discount_factors, axis=0)
        discount_factors = np.repeat(discount_factors, batch_size, axis=0)

        # values after a terminal step must not be discounted into the preceding sub-trajectory
        indices = np.where(terminals)
        indices = np.transpose(indices)
        for batch_index, time_index in indices:
            discount_factors[batch_index, time_index + 1:, :time_index + 1] = 0.

        return discount_factors

    discount_matrices = tf.py_func(fn, [terminals, discount_factor], tf.float32, stateful=True)

    values = tf.expand_dims(values, axis=1)
    discounted_values = tf.matmul(values, discount_matrices)

    return tf.squeeze(discounted_values, axis=1)


def _discount_bootstrap(values, terminals, discount_factor):
    # terminal trajectories do not need bootstrapping, so this returns discounted bootstrap values
    # for all non-terminal sub-trajectories

    def fn(terminals):
        # returns exponentiated discount factors for all non-terminal sub-trajectories:
        # discount_factor ** (num_steps - t) at time step t if steps t through num_steps - 1 are all
        # non-terminal, and 0 otherwise
        return np.flip(np.cumprod(np.cumprod(np.flip(np.invert(terminals), axis=1), axis=1, dtype=np.int32)
                                  * discount_factor, axis=1, dtype=np.float32), axis=1)

    discount_factors = tf.py_func(fn, [terminals], tf.float32, stateful=True)
    return discount_factors * tf.expand_dims(values, axis=-1)  # element-wise multiplication
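
# Illustrative sketch (not part of the original module): what `_discount` and `_discount_bootstrap` compute
# for a single trajectory of 3 steps with discount factor 0.9 and no terminal states. The numbers below are
# example values only, not produced anywhere in this module.
def _example_discounting():
    rewards = np.array([[1., 2., 3.]], dtype=np.float32)  # shape [batch_size=1, num_steps=3]
    gamma = 0.9

    # `_discount` effectively multiplies with a lower-triangular matrix of powers of gamma:
    #   [[1.,   0.,  0.],
    #    [0.9,  1.,  0.],
    #    [0.81, 0.9, 1.]]
    # so the discounted rewards are [1 + 0.9*2 + 0.81*3, 2 + 0.9*3, 3] = [5.23, 4.7, 3.]
    matrix = np.tril(gamma ** (np.arange(3)[:, None] - np.arange(3)[None, :]))
    discounted_rewards = rewards @ matrix

    # `_discount_bootstrap` scales the bootstrap value V(s_3) by [gamma**3, gamma**2, gamma**1]
    # (zeroed after a terminal step); this is added to the discounted rewards to form the target values
    bootstrap_factors = gamma ** np.arange(3, 0, -1)

    return discounted_rewards, bootstrap_factors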