IPPO

Bases: IPPOBase

IPPO clip agent using the GAE (PPO2) for calculating the advantage. The actor loss function standardizes the advantage.

Source code in jaxagents\ippo.py (lines 1404-1583)
class IPPO(IPPOBase):

    """
    IPPO clip agent using the GAE (PPO2) for calculating the advantage. The actor loss function standardizes the
    advantage.
    """

    @partial(jax.jit, static_argnums=(0,))
    def _trajectory_returns(self, value: Float[Array, "batch_size"], traj: Transition) -> Tuple[float, float]:
        """
        Calculates the returns per episode step over a batch of trajectories.
        :param value: The values of the steps in the trajectory according to the critic (including that of the last
        state). At the beginning of the method, 'value' is the value of the state at the next step of the trajectory
        (not of the reverse iteration), and after the calculation it is the value of the examined step.
        :param traj: The trajectory batch.
        :return: The return of the examined step, duplicated as carry and output for the scan.
        """
        rewards, discounts, next_state_values, gae_lambda = traj
        value = rewards + discounts * ((1 - gae_lambda) * next_state_values + gae_lambda * value)
        return value, value

    @partial(jax.jit, static_argnums=(0,))
    def _trajectory_advantages(self, advantage: Float[Array, "batch_size"], traj: Transition) -> Tuple[float, float]:
        """
        Calculates the GAE per episode step over a batch of trajectories.
        :param advantage: The GAE advantages of the steps in the trajectory according to the critic (including that of
        the last state). At the beginning of the method, 'advantage' is the advantage of the state at the next step of
        the trajectory (not of the reverse iteration), and after the calculation it is the advantage of the examined
        step.
        :param traj: The trajectory batch.
        :return: The GAE advantage of the examined step, duplicated as carry and output for the scan.
        """
        rewards, values, next_state_values, terminated, gamma, gae_lambda = traj
        d_t = rewards + (1 - terminated) * gamma * next_state_values - values  # Temporal difference residual at time t
        advantage = d_t + gamma * gae_lambda * (1 - terminated) * advantage
        return advantage, advantage

    @partial(jax.jit, static_argnums=(0,))
    def _actor_loss(
            self,
            training: TrainState,
            obs: Annotated[ObsType, "n_rollout batch_size"],
            actions: Annotated[ActionType, "batch_size"],
            log_prob_old: Float[Array, "n_rollout batch_size"],
            advantage: ReturnsType,
            hyperparams: HyperParameters
    )-> Tuple[Float[Array, "1"], Float[Array, "1"]]:
        """
        Calculates the actor loss using the clipped PPO surrogate objective. The GAE advantage is standardized before
        it enters the loss.
        :param training: The actor TrainState object.
        :param obs: The observations in the trajectory batch.
        :param actions: The actions in the trajectory batch.
        :param log_prob_old: Log-probabilities of the old policy collected over the trajectory batch.
        :param advantage: The GAE over the trajectory batch.
        :param hyperparams: The HyperParameters object used for training.
        :return: A tuple containing the actor loss and the KL divergence (used for the early-stopping criterion).
        """

        """ Standardize GAE, greatly improves behaviour"""
        advantage = (advantage - advantage.mean(axis=0)) / (advantage.std(axis=0) + 1e-8)

        log_prob_vmap = jax.vmap(jax.vmap(self._log_prob, in_axes=(None, None, 0, 0)), in_axes=(None, None, 0, 0))
        log_prob = log_prob_vmap(training, training.params, obs, actions)
        log_policy_ratio = log_prob - log_prob_old
        policy_ratio = jnp.exp(log_policy_ratio)
        kl = jnp.sum(-log_policy_ratio)

        """
        Adopt simplified formulation of clipped policy ratio * advantage as explained in the note of:
        https://spinningup.openai.com/en/latest/algorithms/ppo.html#id2
        """
        clip = jnp.where(jnp.greater(advantage, 0), 1 + hyperparams.eps_clip, 1 - hyperparams.eps_clip)
        advantage_clip = advantage * clip

        """Actual clip calculation - not used but left here for comparison to simplified version"""
        # advantage_clip = jnp.clip(policy_ratio, 1 - hyperparams.eps_clip, 1 + hyperparams.eps_clip) * advantage

        loss_actor = jnp.minimum(policy_ratio * advantage, advantage_clip)

        entropy_vmap = jax.vmap(jax.vmap(self._entropy, in_axes=(None, 0)), in_axes=(None, 0))
        entropy = entropy_vmap(training, obs)

        total_loss_actor = loss_actor.mean() + hyperparams.ent_coeff * entropy.mean()

        """ Negative loss, because we want ascent but 'apply_gradients' applies descent """
        return -total_loss_actor, kl

    @partial(jax.jit, static_argnums=(0,))
    def _critic_loss(
            self,
            training: TrainState,
            obs: Annotated[ObsType, "n_rollout batch_size"],
            targets: ReturnsType,
            hyperparams: HyperParameters
    ) -> Float[Array, "1"]:
        """
        Calculates the critic loss.
        :param training: The critic TrainState object.
        :param obs: The obs in the trajectory batch.
        :param targets: The targets over the trajectory batch for training the critic.
        :param hyperparams: The HyperParameters object used for training.
        :return: The critic loss.
        """

        value_vmap = jax.vmap(jax.vmap(training.apply_fn, in_axes=(None, 0)), in_axes=(None, 0))
        value = value_vmap(training.params, obs)
        residuals = value - targets
        value_loss = jnp.mean(residuals ** 2)
        critic_total_loss = hyperparams.vf_coeff * value_loss

        return critic_total_loss

    @partial(jax.jit, static_argnums=(0,))
    def _actor_loss_input(self, update_runner: Runner, traj_batch: Transition) -> ActorLossInputType:
        """
        Prepares the input required by the actor loss function. For the PPO agent, this comprises:
        - the observations collected over the trajectory batch.
        - the actions collected over the trajectory batch.
        - the log-probabilities of the actions under the old policy.
        - the GAE advantages over the trajectory batch.
        - the training hyperparameters.
        The input is reshaped so that it is split into minibatches.
        :param update_runner: The Runner object used in training.
        :param traj_batch: The batch of trajectories.
        :return: A tuple of input to the actor loss function.
        """

        # Shuffle the trajectory batch to collect minibatches.
        # Using a fixed PRNG key here is poor practice, but it does not affect training, since every trajectory in the
        # batch is used in each epoch.
        minibatch_idx = jax.random.choice(
            jax.random.PRNGKey(1),
            jnp.arange(self.config.batch_size),
            replace=False,
            shape=(self.config.batch_size,)
        )

        traj_minibatch = jax.tree_map(lambda x: jnp.take(x, minibatch_idx, axis=0), traj_batch)
        traj_minibatch = jax.tree_map(lambda x: x.reshape(-1, self.config.minibatch_size, *x.shape[1:]), traj_minibatch)

        return (
            traj_minibatch.obs,
            traj_minibatch.action,
            traj_minibatch.log_prob,
            traj_minibatch.advantage,
            update_runner.hyperparams
        )

    @partial(jax.jit, static_argnums=(0,))
    def _critic_loss_input(self, update_runner: Runner, traj_batch: Transition) -> CriticLossInputType:
        """
        Prepares the input required by the critic loss function. For the PPO agent, this comprises:
        - the observations collected over the trajectory batch.
        - the targets (returns = GAE advantage + critic value) over the trajectory batch.
        - the training hyperparameters.
        The input is reshaped so that it is split into minibatches.
        :param update_runner: The Runner object used in training.
        :param traj_batch: The batch of trajectories.
        :return: A tuple of input to the critic loss function.
        """

        # Shuffle the trajectory batch to collect minibatches.
        # Using a fixed PRNG key here is poor practice, but it does not affect training, since every trajectory in the
        # batch is used in each epoch.
        minibatch_idx = jax.random.choice(
            jax.random.PRNGKey(1),
            jnp.arange(self.config.batch_size),
            replace=False,
            shape=(self.config.batch_size,)
        )

        traj_minibatch = jax.tree_map(lambda x: jnp.take(x, minibatch_idx, axis=0), traj_batch)
        traj_minibatch = jax.tree_map(lambda x: x.reshape(-1, self.config.minibatch_size, *x.shape[1:]), traj_minibatch)

        return (
            traj_minibatch.obs,
            traj_minibatch.advantage + traj_minibatch.value,
            update_runner.hyperparams
        )

_actor_loss(training, obs, actions, log_prob_old, advantage, hyperparams)

Calculates the actor loss using the clipped PPO surrogate objective. The GAE advantage is standardized before it enters the loss. :param training: The actor TrainState object. :param obs: The observations in the trajectory batch. :param actions: The actions in the trajectory batch. :param log_prob_old: Log-probabilities of the old policy collected over the trajectory batch. :param advantage: The GAE over the trajectory batch. :param hyperparams: The HyperParameters object used for training. :return: A tuple containing the actor loss and the KL divergence (used for the early-stopping criterion).

Source code in jaxagents\ippo.py (lines 1441-1490)
@partial(jax.jit, static_argnums=(0,))
def _actor_loss(
        self,
        training: TrainState,
        obs: Annotated[ObsType, "n_rollout batch_size"],
        actions: Annotated[ActionType, "batch_size"],
        log_prob_old: Float[Array, "n_rollout batch_size"],
        advantage: ReturnsType,
        hyperparams: HyperParameters
)-> Tuple[Float[Array, "1"], Float[Array, "1"]]:
    """
    Calculates the actor loss using the clipped PPO surrogate objective. The GAE advantage is standardized before
    it enters the loss.
    :param training: The actor TrainState object.
    :param obs: The observations in the trajectory batch.
    :param actions: The actions in the trajectory batch.
    :param log_prob_old: Log-probabilities of the old policy collected over the trajectory batch.
    :param advantage: The GAE over the trajectory batch.
    :param hyperparams: The HyperParameters object used for training.
    :return: A tuple containing the actor loss and the KL divergence (used for the early-stopping criterion).
    """

    """ Standardize GAE, greatly improves behaviour"""
    advantage = (advantage - advantage.mean(axis=0)) / (advantage.std(axis=0) + 1e-8)

    log_prob_vmap = jax.vmap(jax.vmap(self._log_prob, in_axes=(None, None, 0, 0)), in_axes=(None, None, 0, 0))
    log_prob = log_prob_vmap(training, training.params, obs, actions)
    log_policy_ratio = log_prob - log_prob_old
    policy_ratio = jnp.exp(log_policy_ratio)
    kl = jnp.sum(-log_policy_ratio)

    """
    Adopt simplified formulation of clipped policy ratio * advantage as explained in the note of:
    https://spinningup.openai.com/en/latest/algorithms/ppo.html#id2
    """
    clip = jnp.where(jnp.greater(advantage, 0), 1 + hyperparams.eps_clip, 1 - hyperparams.eps_clip)
    advantage_clip = advantage * clip

    """Actual clip calculation - not used but left here for comparison to simplified version"""
    # advantage_clip = jnp.clip(policy_ratio, 1 - hyperparams.eps_clip, 1 + hyperparams.eps_clip) * advantage

    loss_actor = jnp.minimum(policy_ratio * advantage, advantage_clip)

    entropy_vmap = jax.vmap(jax.vmap(self._entropy, in_axes=(None, 0)), in_axes=(None, 0))
    entropy = entropy_vmap(training, obs)

    total_loss_actor = loss_actor.mean() + hyperparams.ent_coeff * entropy.mean()

    """ Negative loss, because we want ascent but 'apply_gradients' applies descent """
    return -total_loss_actor, kl
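
As a companion to the listing above, here is a minimal, self-contained sketch of the clipped-surrogate mechanics on plain arrays (the helper name clipped_surrogate and its array arguments are hypothetical; the entropy bonus, the sign flip for gradient descent, and the vmapped policy evaluation are omitted):

import jax.numpy as jnp

def clipped_surrogate(log_prob, log_prob_old, advantage, eps_clip=0.2):
    # Standardize the advantage estimates over the batch axis.
    advantage = (advantage - advantage.mean(axis=0)) / (advantage.std(axis=0) + 1e-8)

    log_ratio = log_prob - log_prob_old
    ratio = jnp.exp(log_ratio)
    approx_kl = jnp.sum(-log_ratio)  # crude KL estimate, checked against a threshold for early stopping

    # Simplified clipping: the sign of the advantage decides which clip bound is active,
    # per the note in https://spinningup.openai.com/en/latest/algorithms/ppo.html#id2
    clip = jnp.where(advantage > 0, 1.0 + eps_clip, 1.0 - eps_clip)
    surrogate = jnp.minimum(ratio * advantage, clip * advantage)

    return surrogate.mean(), approx_kl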

_actor_loss_input(update_runner, traj_batch)

Prepares the input required by the actor loss function. For the PPO agent, this comprises: - the observations collected over the trajectory batch. - the actions collected over the trajectory batch. - the log-probabilities of the actions under the old policy. - the GAE advantages over the trajectory batch. - the training hyperparameters. The input is reshaped so that it is split into minibatches. :param update_runner: The Runner object used in training. :param traj_batch: The batch of trajectories. :return: A tuple of input to the actor loss function.

Source code in jaxagents\ippo.py (lines 1517-1551)
@partial(jax.jit, static_argnums=(0,))
def _actor_loss_input(self, update_runner: Runner, traj_batch: Transition) -> ActorLossInputType:
    """
    Prepares the input required by the actor loss function. For the PPO agent, this comprises:
    - the observations collected over the trajectory batch.
    - the actions collected over the trajectory batch.
    - the log-probabilities of the actions under the old policy.
    - the GAE advantages over the trajectory batch.
    - the training hyperparameters.
    The input is reshaped so that it is split into minibatches.
    :param update_runner: The Runner object used in training.
    :param traj_batch: The batch of trajectories.
    :return: A tuple of input to the actor loss function.
    """

    # Shuffle the trajectory batch to collect minibatches.
    # Using a fixed PRNG key here is poor practice, but it does not affect training, since every trajectory in the
    # batch is used in each epoch.
    minibatch_idx = jax.random.choice(
        jax.random.PRNGKey(1),
        jnp.arange(self.config.batch_size),
        replace=False,
        shape=(self.config.batch_size,)
    )

    traj_minibatch = jax.tree_map(lambda x: jnp.take(x, minibatch_idx, axis=0), traj_batch)
    traj_minibatch = jax.tree_map(lambda x: x.reshape(-1, self.config.minibatch_size, *x.shape[1:]), traj_minibatch)

    return (
        traj_minibatch.obs,
        traj_minibatch.action,
        traj_minibatch.log_prob,
        traj_minibatch.advantage,
        update_runner.hyperparams
    )
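
A self-contained sketch of the shuffle-and-reshape pattern used above, with a hypothetical make_minibatches helper that takes a fresh PRNG key instead of the fixed PRNGKey(1) mentioned in the comment (the trajectory batch is assumed to be a pytree of arrays with a leading batch axis):

import jax
import jax.numpy as jnp

def make_minibatches(rng, traj_batch, batch_size, minibatch_size):
    # Random permutation of the batch indices.
    idx = jax.random.permutation(rng, batch_size)
    # Shuffle every leaf of the pytree along the batch axis.
    shuffled = jax.tree_util.tree_map(lambda x: jnp.take(x, idx, axis=0), traj_batch)
    # (batch_size, ...) -> (n_minibatches, minibatch_size, ...)
    return jax.tree_util.tree_map(
        lambda x: x.reshape(-1, minibatch_size, *x.shape[1:]), shuffled
    )

# Example with a dict standing in for the Transition pytree:
batch = {"obs": jnp.ones((8, 4)), "action": jnp.zeros((8,), dtype=jnp.int32)}
minibatches = make_minibatches(jax.random.PRNGKey(0), batch, batch_size=8, minibatch_size=2)
# minibatches["obs"].shape == (4, 2, 4)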

_critic_loss(training, obs, targets, hyperparams)

Calculates the critic loss. :param training: The critic TrainState object. :param obs: The obs in the trajectory batch. :param targets: The targets over the trajectory batch for training the critic. :param hyperparams: The HyperParameters object used for training. :return: The critic loss.

Source code in jaxagents\ippo.py (lines 1492-1515)
@partial(jax.jit, static_argnums=(0,))
def _critic_loss(
        self,
        training: TrainState,
        obs: Annotated[ObsType, "n_rollout batch_size"],
        targets: ReturnsType,
        hyperparams: HyperParameters
) -> Float[Array, "1"]:
    """
    Calculates the critic loss.
    :param training: The critic TrainState object.
    :param obs: The obs in the trajectory batch.
    :param targets: The targets over the trajectory batch for training the critic.
    :param hyperparams: The HyperParameters object used for training.
    :return: The critic loss.
    """

    value_vmap = jax.vmap(jax.vmap(training.apply_fn, in_axes=(None, 0)), in_axes=(None, 0))
    value = value_vmap(training.params, obs)
    residuals = value - targets
    value_loss = jnp.mean(residuals ** 2)
    critic_total_loss = hyperparams.vf_coeff * value_loss

    return critic_total_loss
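
The value regression can be illustrated with a small standalone sketch (the function critic_mse_loss and the apply_fn argument are stand-ins for the critic's TrainState fields; shapes are illustrative):

import jax
import jax.numpy as jnp

def critic_mse_loss(params, apply_fn, obs, targets, vf_coeff=0.5):
    # obs has shape (n_rollout, minibatch_size, obs_dim); vmap over both leading axes.
    value_fn = jax.vmap(jax.vmap(apply_fn, in_axes=(None, 0)), in_axes=(None, 0))
    values = value_fn(params, obs)
    # Weighted mean squared error between the critic values and the GAE-based targets.
    return vf_coeff * jnp.mean((values - targets) ** 2)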

_critic_loss_input(update_runner, traj_batch)

Prepares the input required by the critic loss function. For the PPO agent, this comprises: - the observations collected over the trajectory batch. - the targets (returns = GAE advantage + critic value) over the trajectory batch. - the training hyperparameters. The input is reshaped so that it is split into minibatches. :param update_runner: The Runner object used in training. :param traj_batch: The batch of trajectories. :return: A tuple of input to the critic loss function.

Source code in jaxagents\ippo.py (lines 1553-1583)
@partial(jax.jit, static_argnums=(0,))
def _critic_loss_input(self, update_runner: Runner, traj_batch: Transition) -> CriticLossInputType:
    """
    Prepares the input required by the critic loss function. For the PPO agent, this comprises:
    - the observations collected over the trajectory batch.
    - the targets (returns = GAE advantage + critic value) over the trajectory batch.
    - the training hyperparameters.
    The input is reshaped so that it is split into minibatches.
    :param update_runner: The Runner object used in training.
    :param traj_batch: The batch of trajectories.
    :return: A tuple of input to the critic loss function.
    """

    # Shuffle the trajectory batch to collect minibatches.
    # Using a fixed PRNG key here is poor practice, but it does not affect training, since every trajectory in the
    # batch is used in each epoch.
    minibatch_idx = jax.random.choice(
        jax.random.PRNGKey(1),
        jnp.arange(self.config.batch_size),
        replace=False,
        shape=(self.config.batch_size,)
    )

    traj_minibatch = jax.tree_map(lambda x: jnp.take(x, minibatch_idx, axis=0), traj_batch)
    traj_minibatch = jax.tree_map(lambda x: x.reshape(-1, self.config.minibatch_size, *x.shape[1:]), traj_minibatch)

    return (
        traj_minibatch.obs,
        traj_minibatch.advantage + traj_minibatch.value,
        update_runner.hyperparams
    )
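
The targets handed to the critic are simply the GAE advantages plus the critic values that produced them, i.e. the lambda-returns; a toy example with made-up numbers:

import jax.numpy as jnp

advantages = jnp.array([0.4, -0.1, 0.2])   # GAE advantages A_t
values     = jnp.array([1.0,  0.9, 0.8])   # critic values V(s_t) stored in the trajectory
targets    = advantages + values           # lambda-returns used as regression targets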

_trajectory_advantages(advantage, traj)

Calculates the GAE per episode step over a batch of trajectories. :param advantage: The GAE advantages of the steps in the trajectory according to the critic (including that of the last state). At the beginning of the method, 'advantage' is the advantage of the state at the next step of the trajectory (not of the reverse iteration), and after the calculation it is the advantage of the examined step. :param traj: The trajectory batch. :return: The GAE advantage of the examined step, duplicated as carry and output for the scan.

Source code in jaxagents\ippo.py (lines 1425-1439)
@partial(jax.jit, static_argnums=(0,))
def _trajectory_advantages(self, advantage: Float[Array, "batch_size"], traj: Transition) -> Tuple[float, float]:
    """
    Calculates the GAE per episode step over a batch of trajectories.
    :param advantage: The GAE advantages of the steps in the trajectory according to the critic (including that of
    the last state). At the beginning of the method, 'advantage' is the advantage of the state at the next step of
    the trajectory (not of the reverse iteration), and after the calculation it is the advantage of the examined
    step.
    :param traj: The trajectory batch.
    :return: The GAE advantage of the examined step, duplicated as carry and output for the scan.
    """
    rewards, values, next_state_values, terminated, gamma, gae_lambda = traj
    d_t = rewards + (1 - terminated) * gamma * next_state_values - values  # Temporal difference residual at time t
    advantage = d_t + gamma * gae_lambda * (1 - terminated) * advantage
    return advantage, advantage
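
A minimal, self-contained sketch of how a per-step function like this is typically driven by jax.lax.scan in reverse over a single rollout (shapes and constants are illustrative; the library applies the same pattern to batched trajectories):

import jax
import jax.numpy as jnp

def gae_step(advantage, step):
    # One backward GAE step: delta_t + gamma * lambda * (1 - terminated) * A_{t+1}
    reward, value, next_value, terminated, gamma, lam = step
    delta = reward + (1.0 - terminated) * gamma * next_value - value
    advantage = delta + gamma * lam * (1.0 - terminated) * advantage
    return advantage, advantage

T = 5
rewards     = jnp.ones(T)
values      = jnp.linspace(1.0, 0.6, T)
next_values = jnp.linspace(0.9, 0.5, T)
terminated  = jnp.zeros(T).at[-1].set(1.0)
gamma       = jnp.full(T, 0.99)
lam         = jnp.full(T, 0.95)

# Scan backwards over the rollout, starting from a zero advantage beyond the last step.
_, advantages = jax.lax.scan(
    gae_step,
    jnp.zeros(()),
    (rewards, values, next_values, terminated, gamma, lam),
    reverse=True,
)
# advantages[t] is the GAE estimate for step t, in forward order.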

_trajectory_returns(value, traj)

Calculates the returns per episode step over a batch of trajectories. :param value: The values of the steps in the trajectory according to the critic (including that of the last state). At the beginning of the method, 'value' is the value of the state at the next step of the trajectory (not of the reverse iteration), and after the calculation it is the value of the examined step. :param traj: The trajectory batch. :return: The return of the examined step, duplicated as carry and output for the scan.

Source code in jaxagents\ippo.py (lines 1411-1423)
@partial(jax.jit, static_argnums=(0,))
def _trajectory_returns(self, value: Float[Array, "batch_size"], traj: Transition) -> Tuple[float, float]:
    """
    Calculates the returns per episode step over a batch of trajectories.
    :param value: The values of the steps in the trajectory according to the critic (including that of the last
    state). At the beginning of the method, 'value' is the value of the state at the next step of the trajectory
    (not of the reverse iteration), and after the calculation it is the value of the examined step.
    :param traj: The trajectory batch.
    :return: The return of the examined step, duplicated as carry and output for the scan.
    """
    rewards, discounts, next_state_values, gae_lambda = traj
    value = rewards + discounts * ((1 - gae_lambda) * next_state_values + gae_lambda * value)
    return value, value
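
The recursion in this scan body interpolates between a one-step bootstrapped target and the full Monte Carlo return, which a toy calculation makes concrete (all numbers are made up for illustration):

# One backward step of the lambda-return recursion used above:
#   G_t = r_t + discount_t * ((1 - lambda) * V(s_{t+1}) + lambda * G_{t+1})
def return_step(next_return, reward, discount, next_value, lam):
    return reward + discount * ((1.0 - lam) * next_value + lam * next_return)

r, d, v_next, g_next = 1.0, 0.99, 0.5, 2.0
one_step    = return_step(g_next, r, d, v_next, lam=0.0)  # 1.0 + 0.99 * 0.5 = 1.495 (bootstrapped target)
monte_carlo = return_step(g_next, r, d, v_next, lam=1.0)  # 1.0 + 0.99 * 2.0 = 2.98 (full return)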