Source code for actorcritic.agents

"""Contains `agents`, which are an abstraction from environments."""

from abc import ABCMeta, abstractmethod


class Agent(object, metaclass=ABCMeta):
    """Takes environments and a model (containing a policy) and provides :meth:`interact`, which manages
    operations such as selecting actions from the model and stepping in the environments.

    See Also:
        This allows creating multi-step agents, like :obj:`SingleEnvAgent` and :obj:`MultiEnvAgent`.
    """

    @abstractmethod
    def interact(self, session):
        """Samples actions from the model and steps in the environments.

        Args:
            session (:obj:`tf.Session`): A session that will be used to compute the actions.

        Returns:
            :obj:`tuple`: A tuple of (`observations`, `actions`, `rewards`, `terminals`, `next_observations`,
            `infos`).

            All values are in `batch-major` format, meaning that the rows determine the batch and the columns
            determine the time: [`batch`, `time`]. In our case the rows correspond to the environments and the
            columns correspond to the steps: [`environment`, `step`]. The opposite is the `time-major` format:
            [`time`, `batch`] or [`step`, `environment`].

            Example:
                If the agent maintains `3` environments and samples for `5` steps, the result would consist of a
                matrix (:obj:`list` of :obj:`list`) with shape [`3`, `5`]::

                    [
                        [step 1, step 2, step 3, step 4, step 5],  # environment 1
                        [step 1, step 2, step 3, step 4, step 5],  # environment 2
                        [step 1, step 2, step 3, step 4, step 5]   # environment 3
                    ]

            `observations`, `actions`, `rewards`, `terminals`, and `infos` are collected during sampling and have
            the shape [`environments`, `steps`].

            `next_observations` contains the observations that the agent received last but has not yet used for
            selecting actions. These can be used e.g. to bootstrap the remaining returns. Has the shape
            [`environments`, `1`].
        """
        pass


class SingleEnvAgent(Agent):
    """An agent that maintains a single environment and samples multiple steps."""

    def __init__(self, env, model, num_steps):
        """
        Args:
            env (:obj:`gym.Env`): An environment.
            model (:obj:`~actorcritic.model.ActorCriticModel`): A model to sample actions.
            num_steps (:obj:`int`): The number of steps to take in :meth:`interact`.
        """
        self._env = env
        self._model = model
        self._num_steps = num_steps

        # stores the observation between calls of `interact` to reuse `next_observations`
        self._observation = None

    def interact(self, session):
        """Samples actions from the model and steps in the environment.

        Args:
            session (:obj:`tf.Session`): A session that will be used to compute the actions.

        Returns:
            :obj:`tuple`: A tuple of (`observations`, `actions`, `rewards`, `terminals`, `next_observations`,
            `infos`).

            All values are in `batch-major` format, meaning that the rows determine the batch and the columns
            determine the time: [`batch`, `time`]. In this case there is only `one` environment, so the single row
            corresponds to the environment and the columns correspond to the steps: [`1`, `step`]. The opposite is
            the `time-major` format: [`time`, `batch`] or [`step`, `1`].

            `observations`, `actions`, `rewards`, `terminals`, and `infos` are collected during sampling and have
            the shape [`1`, `steps`].

            `next_observations` contains the observation that the agent received last but has not yet used for
            selecting an action. This can be used e.g. to bootstrap the remaining return. Has the shape [`1`, `1`].
        """
        # set up time-major values [step]
        observation_steps = []
        action_steps = []
        reward_steps = []
        terminal_steps = []
        info_steps = []

        # get the last observation
        next_observation = self._observation
        if next_observation is None:
            next_observation = self._env.reset()

        for _ in range(self._num_steps):
            # save the current observation
            observation_steps.append(next_observation)

            action = self._model.sample_actions([[next_observation]], session)[0]
            next_observation, reward, terminal, info = self._env.step(action)

            # save the current step
            action_steps.append(action)
            reward_steps.append(reward)
            terminal_steps.append(terminal)
            info_steps.append(info)

        # save for the next call of `interact`
        self._observation = next_observation

        # convert from time-major [step] to batch-major values [1, step]
        observation_batch = [observation_steps]
        action_batch = [action_steps]
        reward_batch = [reward_steps]
        terminal_batch = [terminal_steps]
        info_batch = [info_steps]

        return observation_batch, action_batch, reward_batch, terminal_batch, [next_observation], info_batch
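

# Usage sketch (not part of the original module): drives `SingleEnvAgent` with a duck-typed stand-in model that
# only implements `sample_actions(observations, session)`, the single model method `interact` relies on. The gym
# environment id, the stand-in model, and passing `session=None` are assumptions for illustration; with a real
# `actorcritic.model.ActorCriticModel` a `tf.Session` would be passed instead. Assumes the classic `gym` step API
# (4-tuple) that this module is written against.
def _example_single_env_usage():
    import gym

    class _RandomModel:
        """Stand-in model that samples one random action per observation row (ignores the session)."""

        def __init__(self, action_space):
            self._action_space = action_space

        def sample_actions(self, observations, session):
            # `observations` arrives in batch-major format [env, step]
            return [self._action_space.sample() for _ in observations]

    env = gym.make('CartPole-v1')  # hypothetical example environment
    agent = SingleEnvAgent(env, _RandomModel(env.action_space), num_steps=5)
    observations, actions, rewards, terminals, next_observations, infos = agent.interact(session=None)

    # `rewards` is batch-major with shape [1, 5]: one environment (row), five steps (columns)
    assert len(rewards) == 1 and len(rewards[0]) == 5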


class MultiEnvAgent(Agent):
    """An agent that maintains multiple environments (via :obj:`~actorcritic.multi_env.MultiEnv`) and samples
    multiple steps.
    """

    def __init__(self, multi_env, model, num_steps):
        """
        Args:
            multi_env (:obj:`~actorcritic.multi_env.MultiEnv`): Multiple environments.
            model (:obj:`~actorcritic.model.ActorCriticModel`): A model to sample actions.
            num_steps (:obj:`int`): The number of steps to take in :meth:`interact`.
        """
        self._env = multi_env
        self._model = model
        self._num_steps = num_steps

        # stores the observations between calls of `interact` to reuse `next_observations`
        self._observations = None

    def interact(self, session):
        """Samples actions from the model and steps in the environments.

        Args:
            session (:obj:`tf.Session`): A session that will be used to compute the actions.

        Returns:
            :obj:`tuple`: A tuple of (`observations`, `actions`, `rewards`, `terminals`, `next_observations`,
            `infos`).

            All values are in `batch-major` format, meaning that the rows determine the batch and the columns
            determine the time: [`batch`, `time`]. In our case the rows correspond to the environments and the
            columns correspond to the steps: [`environment`, `step`]. The opposite is the `time-major` format:
            [`time`, `batch`] or [`step`, `environment`].

            Example:
                If the agent maintains `3` environments and samples for `5` steps, the result would consist of a
                matrix (:obj:`list` of :obj:`list`) with shape [`3`, `5`]::

                    [
                        [step 1, step 2, step 3, step 4, step 5],  # environment 1
                        [step 1, step 2, step 3, step 4, step 5],  # environment 2
                        [step 1, step 2, step 3, step 4, step 5]   # environment 3
                    ]

            `observations`, `actions`, `rewards`, `terminals`, and `infos` are collected during sampling and have
            the shape [`environments`, `steps`].

            `next_observations` contains the observations that the agent received last but has not yet used for
            selecting actions. These can be used e.g. to bootstrap the remaining returns. Has the shape
            [`environments`, `1`].
        """
        # set up time-major values [step, env]
        observation_steps = []
        action_steps = []
        reward_steps = []
        terminal_steps = []
        info_steps = []

        # get the last observations
        next_observations = self._observations
        if next_observations is None:
            next_observations = self._env.reset()

        for _ in range(self._num_steps):
            # save the current observations
            observation_steps.append(next_observations)

            # convert `next_observations` from [env] to batch-major [env, 1] by transposing [1, env]
            batch_next_observations = transpose_list([next_observations])

            actions = self._model.sample_actions(batch_next_observations, session)
            next_observations, rewards, terminals, infos = self._env.step(actions)

            # save the current step
            action_steps.append(actions)
            reward_steps.append(rewards)
            terminal_steps.append(terminals)
            info_steps.append(infos)

        # save for the next call of `interact`
        self._observations = next_observations

        # convert from time-major [step, env] to batch-major values [env, step]
        observation_batch = transpose_list(observation_steps)
        action_batch = transpose_list(action_steps)
        reward_batch = transpose_list(reward_steps)
        terminal_batch = transpose_list(terminal_steps)
        info_batch = transpose_list(info_steps)

        return observation_batch, action_batch, reward_batch, terminal_batch, next_observations, info_batch
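

# Usage sketch (not part of the original module): `MultiEnvAgent` only calls `reset()` and `step(actions)` on its
# `multi_env`, so a minimal duck-typed stand-in over a list of gym environments is used here instead of
# `actorcritic.multi_env.MultiEnv`, whose constructor is not shown in this module. The environment id, the
# stand-in classes, and passing `session=None` are assumptions for illustration.
def _example_multi_env_usage():
    import gym

    class _ListEnv:
        """Stand-in for `MultiEnv`: steps a list of environments sequentially."""

        def __init__(self, envs):
            self._envs = envs

        def reset(self):
            return [env.reset() for env in self._envs]

        def step(self, actions):
            results = [env.step(action) for env, action in zip(self._envs, actions)]
            observations, rewards, terminals, infos = zip(*results)
            return list(observations), list(rewards), list(terminals), list(infos)

    class _RandomModel:
        """Stand-in model that samples one random action per observation row (ignores the session)."""

        def __init__(self, action_space):
            self._action_space = action_space

        def sample_actions(self, observations, session):
            return [self._action_space.sample() for _ in observations]

    envs = [gym.make('CartPole-v1') for _ in range(3)]  # hypothetical example environments
    agent = MultiEnvAgent(_ListEnv(envs), _RandomModel(envs[0].action_space), num_steps=5)
    observations, actions, rewards, terminals, next_observations, infos = agent.interact(session=None)

    # `rewards` is batch-major with shape [3, 5]: one row per environment, one column per step
    assert len(rewards) == 3 and all(len(row) == 5 for row in rewards)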


def transpose_list(values):
    """Transposes a list of lists. Can be used to convert from `time-major` format to `batch-major` format and
    vice versa.

    Example:
        Input::

            [[1, 2, 3, 4],
             [5, 6, 7, 8],
             [9, 10, 11, 12]]

        Output::

            [[1, 5, 9],
             [2, 6, 10],
             [3, 7, 11],
             [4, 8, 12]]

    Args:
        values (:obj:`list` of :obj:`list`): Values to transpose.

    Returns:
        :obj:`list` of :obj:`list`: The transposed values.
    """
    return list(map(list, zip(*values)))  # taken from: https://stackoverflow.com/a/6473724
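

# Quick demonstration (not part of the original module) of how `transpose_list` flips the time-major lists
# collected in `interact` into batch-major format, here for two environments sampled over three steps:
#
#     reward_steps = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]  # time-major [step, env]
#     transpose_list(reward_steps)  # -> [[1.0, 3.0, 5.0], [2.0, 4.0, 6.0]], batch-major [env, step]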