Source code for actorcritic.envs.atari.model

"""An implementation of an actor-critic model that is aimed at Atari games."""


import gym
import numpy as np
import tensorflow as tf

import actorcritic.nn as nn
from actorcritic.baselines import StateValueFunction
from actorcritic.model import ActorCriticModel
from actorcritic.policies import SoftmaxPolicy


class AtariModel(ActorCriticModel):
    """An :obj:`~actorcritic.model.ActorCriticModel` that follows the A3C and ACKTR papers.

    The observations are sent to three convolutional layers followed by a fully connected layer, each using rectifier
    activation functions (ReLU). The policy and the baseline use separate fully connected layers built on top of the
    last hidden fully connected layer. The policy layer has one unit for each action and its outputs are used as
    logits for a categorical distribution (softmax). The baseline layer has only one unit, which represents its value.
    The weights of the layers are orthogonally initialized.

    Detailed network architecture:

    - Conv2D: 32 filters 8x8, stride 4
    - ReLU
    - Conv2D: 64 filters 4x4, stride 2
    - ReLU
    - Conv2D: 64 filters 3x3, stride 1 (number of filters based on argument `conv3_num_filters`)
    - ReLU
    - Flatten
    - Fully connected: 512 units
    - ReLU
    - Fully connected (policy): units = number of actions / Fully connected (baseline): 1 unit

    A2C uses 64 filters in the third convolutional layer. ACKTR uses 32.

    The policy is a :obj:`~actorcritic.policies.SoftmaxPolicy`. The baseline is a
    :obj:`~actorcritic.baselines.StateValueFunction`.

    See Also:
        This network architecture was originally used in: https://www.nature.com/articles/nature14236
    """
    def __init__(self, observation_space, action_space, conv3_num_filters=64, random_seed=None, name=None):
        """
        Args:
            observation_space (:obj:`gym.spaces.Space`): A space that determines the shape of the
                :attr:`observations_placeholder` and the :attr:`bootstrap_observations_placeholder`.

            action_space (:obj:`gym.spaces.Space`): A space that determines the shape of the
                :attr:`actions_placeholder`.

            conv3_num_filters (:obj:`int`, optional): Number of filters used for the third convolutional layer,
                defaults to 64. ACKTR uses 32.

            random_seed (:obj:`int`, optional): A random seed used for sampling from the
                :obj:`~actorcritic.policies.SoftmaxPolicy`.

            name (:obj:`string`, optional): A name for this model.
        """
        super().__init__(observation_space, action_space)

        assert isinstance(action_space, gym.spaces.Discrete)
        assert isinstance(observation_space, gym.spaces.Box)

        self._num_actions = action_space.n
        self._conv3_num_filters = conv3_num_filters
        self._name = name  # TODO

        # used to convert the outputs of the policy and the baseline back to the batch-major format of the inputs
        # because the values are flattened in between
        with tf.name_scope('shapes'):
            observations_shape = tf.shape(self.observations_placeholder)

            with tf.name_scope('input_shape'):
                input_shape = observations_shape[:2]

            with tf.name_scope('batch_size'):
                batch_size = input_shape[0]

            with tf.name_scope('num_steps'):
                num_steps = input_shape[1]

            with tf.name_scope('bootstrap_input_shape'):
                bootstrap_input_shape = tf.shape(self.bootstrap_observations_placeholder)[:1]

        num_stack = observation_space.shape[-1]

        # the observations are passed in uint8 to save memory and then converted to scalars in range [0, 1] on the gpu
        # by dividing by 255
        with tf.name_scope('normalized_observations'):
            normalized_observations = tf.cast(self.observations_placeholder, dtype=tf.float32) / 255.0
            normalized_bootstrap_observations = tf.cast(
                self.bootstrap_observations_placeholder, dtype=tf.float32) / 255.0

        # convert from batch-major format [environment, step] to one flat vector [environment * step] by stacking the
        # steps of each environment
        # this is necessary since the neural network operations only support batch inputs
        with tf.name_scope('flat_observations'):
            self._flat_observations = tf.stop_gradient(
                tf.reshape(normalized_observations, (-1,) + observation_space.shape))

            flat_bootstrap_observations = tf.stop_gradient(
                tf.reshape(normalized_bootstrap_observations, (-1,) + observation_space.shape))

        with tf.variable_scope(self._name, 'AtariModel'):
            self._params = dict()

            # create parameters for all layers
            self._build_params(num_input_channels=num_stack)

            # create layers for the policy and the baseline that use the standard observations as input
            self._preactivations, self._activations = self._build_layers(self._flat_observations, build_policy=True)

            # create layers for the bootstrap values that use the next observations as input
            _, bootstrap_activations = self._build_layers(flat_bootstrap_observations, build_policy=False)

            with tf.name_scope('policy'):
                policy_logits = tf.reshape(
                    self._activations['fc_policy'], [batch_size, num_steps, self._num_actions])
                self._policy = SoftmaxPolicy(policy_logits, self.actions_placeholder, random_seed)

            with tf.name_scope('baseline'):
                baseline_logits = tf.reshape(self._activations['fc_baseline'], input_shape)
                self._baseline = StateValueFunction(baseline_logits)

            with tf.name_scope('bootstrap_values'):
                self._bootstrap_values = tf.reshape(bootstrap_activations['fc_baseline'], bootstrap_input_shape)
    def _build_params(self, num_input_channels):
        with tf.name_scope('initializers'):
            # values of the initializers taken from original a2c implementation
            weights_initializer = tf.orthogonal_initializer(np.sqrt(2.), dtype=tf.float32)
            bias_initializer = tf.zeros_initializer(dtype=tf.float32)

            policy_weights_initializer = tf.orthogonal_initializer(0.01, dtype=tf.float32)
            baseline_weights_initializer = tf.orthogonal_initializer(1., dtype=tf.float32)

        with tf.variable_scope('conv1'):
            conv1_num_filters = 32
            conv1_filter_extent = 8
            self._params['conv1'] = nn.conv2d_params(
                num_input_channels, conv1_num_filters, conv1_filter_extent, tf.float32,
                weights_initializer, bias_initializer)

        with tf.variable_scope('conv2'):
            conv2_num_filters = 64
            conv2_filter_extent = 4
            self._params['conv2'] = nn.conv2d_params(
                conv1_num_filters, conv2_num_filters, conv2_filter_extent, tf.float32,
                weights_initializer, bias_initializer)

        with tf.variable_scope('conv3'):
            conv3_filter_extent = 3
            self._params['conv3'] = nn.conv2d_params(
                conv2_num_filters, self._conv3_num_filters, conv3_filter_extent, tf.float32,
                weights_initializer, bias_initializer)

        conv3_flat_size = 49 * self._conv3_num_filters  # TODO don't hardcode

        with tf.variable_scope('fc4'):
            fc4_output_size = 512
            self._params['fc4'] = nn.fully_connected_params(
                conv3_flat_size, fc4_output_size, tf.float32, weights_initializer, bias_initializer)

        with tf.variable_scope('fc_policy'):
            self._params['fc_policy'] = nn.fully_connected_params(
                fc4_output_size, self._num_actions, tf.float32, policy_weights_initializer, bias_initializer)

        with tf.variable_scope('fc_baseline'):
            self._params['fc_baseline'] = nn.fully_connected_params(
                fc4_output_size, 1, tf.float32, baseline_weights_initializer, bias_initializer)

    # noinspection PyShadowingBuiltins
    def _build_layers(self, input, build_policy):
        preactivations = dict()
        activations = dict()

        with tf.variable_scope('conv1', reuse=True):
            conv1_pre = nn.conv2d(input, self._params['conv1'], stride=4, padding='VALID')
            conv1 = tf.nn.relu(conv1_pre)

            preactivations['conv1'] = conv1_pre
            activations['conv1'] = conv1

        with tf.variable_scope('conv2', reuse=True):
            conv2_pre = nn.conv2d(conv1, self._params['conv2'], stride=2, padding='VALID')
            conv2 = tf.nn.relu(conv2_pre)

            preactivations['conv2'] = conv2_pre
            activations['conv2'] = conv2

        with tf.variable_scope('conv3', reuse=True):
            conv3_pre = nn.conv2d(conv2, self._params['conv3'], stride=1, padding='VALID')
            conv3 = tf.nn.relu(conv3_pre)

            preactivations['conv3'] = conv3_pre

        with tf.name_scope('flat'):
            conv3_flat = nn.flatten(conv3)
            activations['conv3'] = conv3_flat

        with tf.variable_scope('fc4', reuse=True):
            fc4_pre = nn.fully_connected(conv3_flat, self._params['fc4'])
            fc4 = tf.nn.relu(fc4_pre)

            preactivations['fc4'] = fc4_pre
            activations['fc4'] = fc4

        if build_policy:
            with tf.variable_scope('fc_policy', reuse=True):
                fc_policy = nn.fully_connected(fc4, self._params['fc_policy'])
                activations['fc_policy'] = fc_policy

        with tf.variable_scope('fc_baseline', reuse=True):
            fc_baseline = nn.fully_connected(fc4, self._params['fc_baseline'])
            activations['fc_baseline'] = fc_baseline

        return preactivations, activations
    def register_layers(self, layer_collection):
        """Registers the layers of this model (neural net) in the specified :obj:`kfac.LayerCollection`
        (required for K-FAC).

        Args:
            layer_collection (:obj:`kfac.LayerCollection`): A layer collection used by the
                :obj:`~kfac.KfacOptimizer`.
        """
        layer_collection.register_conv2d(
            self._params['conv1'], strides=[1, 4, 4, 1], padding='VALID',
            inputs=self._flat_observations, outputs=self._preactivations['conv1'])

        layer_collection.register_conv2d(
            self._params['conv2'], strides=[1, 2, 2, 1], padding='VALID',
            inputs=self._activations['conv1'], outputs=self._preactivations['conv2'])

        layer_collection.register_conv2d(
            self._params['conv3'], strides=[1, 1, 1, 1], padding='VALID',
            inputs=self._activations['conv2'], outputs=self._preactivations['conv3'])

        layer_collection.register_fully_connected(
            self._params['fc4'], inputs=self._activations['conv3'], outputs=self._preactivations['fc4'])

        layer_collection.register_fully_connected(
            self._params['fc_policy'], inputs=self._activations['fc4'], outputs=self._activations['fc_policy'])

        layer_collection.register_fully_connected(
            self._params['fc_baseline'], inputs=self._activations['fc4'], outputs=self._activations['fc_baseline'])
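
A minimal usage sketch (not part of the module source). It assumes 84x84 observations with 4 stacked frames, which matches the hard-coded `conv3_flat_size` above, and uses the ACKTR-style `conv3_num_filters=32`; the spaces, the model name, and the K-FAC step are illustrative.

import gym
import kfac
import numpy as np

from actorcritic.envs.atari.model import AtariModel

# illustrative spaces: 84x84 frames, 4 of them stacked in the channel dimension,
# passed as uint8 (the model converts them to floats in [0, 1] internally)
observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 4), dtype=np.uint8)
action_space = gym.spaces.Discrete(4)

# 32 filters in the third convolutional layer corresponds to the ACKTR variant
model = AtariModel(observation_space, action_space, conv3_num_filters=32, name='atari_model')

# for ACKTR, register the model's layers in a kfac.LayerCollection before building the KfacOptimizer
layer_collection = kfac.LayerCollection()
model.register_layers(layer_collection)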