IPPO

Bases: IPPOBase

IPPO clip agent using the GAE (PPO2) for calculating the advantage. The actor loss function standardizes the advantage.

Source code in jaxagents\ippo.py (lines 1404-1583)
class IPPO(IPPOBase):

    """
    IPPO clip agent using the GAE (PPO2) for calculating the advantage. The actor loss function standardizes the
    advantage.
    """

    @partial(jax.jit, static_argnums=(0,))
    def _trajectory_returns(self, value: Float[Array, "batch_size"], traj: Transition) -> Tuple[float, float]:
        """
        Calculates the returns per episode step over a batch of trajectories.
        :param value: The values of the steps in the trajectory according to the critic (including that of the last
        state). At the beginning of the method, 'value' is the value of the state at the next step of the trajectory
        (not of the reverse iteration), and after the calculation it is the value of the examined step.
        :param traj: The trajectory batch.
        :return: The return of the examined step, duplicated as carry and output for the scan.
        """
        rewards, discounts, next_state_values, gae_lambda = traj
        value = rewards + discounts * ((1 - gae_lambda) * next_state_values + gae_lambda * value)
        return value, value

    @partial(jax.jit, static_argnums=(0,))
    def _trajectory_advantages(self, advantage: Float[Array, "batch_size"], traj: Transition) -> Tuple[float, float]:
        """
        Calculates the GAE per episode step over a batch of trajectories.
        :param advantage: The GAE advantages of the steps in the trajectory according to the critic (including that of
        the last state). At the beginning of the method, 'advantage' is the advantage of the state at the next step of
        the trajectory (not of the reverse iteration), and after the calculation it is the advantage of the examined
        step.
        :param traj: The trajectory batch.
        :return: The GAE advantage of the examined step, duplicated as carry and output for the scan.
        """
        rewards, values, next_state_values, terminated, gamma, gae_lambda = traj
        d_t = rewards + (1 - terminated) * gamma * next_state_values - values  # Temporal difference residual at time t
        advantage = d_t + gamma * gae_lambda * (1 - terminated) * advantage
        return advantage, advantage

    @partial(jax.jit, static_argnums=(0,))
    def _actor_loss(
            self,
            training: TrainState,
            obs: Annotated[ObsType, "n_rollout batch_size"],
            actions: Annotated[ActionType, "batch_size"],
            log_prob_old: Float[Array, "n_rollout batch_size"],
            advantage: ReturnsType,
            hyperparams: HyperParameters
    )-> Tuple[Float[Array, "1"], Float[Array, "1"]]:
        """
        Calculates the actor loss using the clipped PPO surrogate objective. The GAE advantage is standardized before
        it enters the loss.
        :param training: The actor TrainState object.
        :param obs: The observations in the trajectory batch.
        :param actions: The actions in the trajectory batch.
        :param log_prob_old: Log-probabilities of the old policy collected over the trajectory batch.
        :param advantage: The GAE over the trajectory batch.
        :param hyperparams: The HyperParameters object used for training.
        :return: A tuple containing the actor loss and the KL divergence (used for the early-stopping criterion).
        """

        """ Standardize GAE, greatly improves behaviour"""
        advantage = (advantage - advantage.mean(axis=0)) / (advantage.std(axis=0) + 1e-8)

        log_prob_vmap = jax.vmap(jax.vmap(self._log_prob, in_axes=(None, None, 0, 0)), in_axes=(None, None, 0, 0))
        log_prob = log_prob_vmap(training, training.params, obs, actions)
        log_policy_ratio = log_prob - log_prob_old
        policy_ratio = jnp.exp(log_policy_ratio)
        kl = jnp.sum(-log_policy_ratio)

        """
        Adopt simplified formulation of clipped policy ratio * advantage as explained in the note of:
        https://spinningup.openai.com/en/latest/algorithms/ppo.html#id2
        """
        clip = jnp.where(jnp.greater(advantage, 0), 1 + hyperparams.eps_clip, 1 - hyperparams.eps_clip)
        advantage_clip = advantage * clip

        """Actual clip calculation - not used but left here for comparison to simplified version"""
        # advantage_clip = jnp.clip(policy_ratio, 1 - hyperparams.eps_clip, 1 + hyperparams.eps_clip) * advantage

        loss_actor = jnp.minimum(policy_ratio * advantage, advantage_clip)

        entropy_vmap = jax.vmap(jax.vmap(self._entropy, in_axes=(None, 0)), in_axes=(None, 0))
        entropy = entropy_vmap(training, obs)

        total_loss_actor = loss_actor.mean() + hyperparams.ent_coeff * entropy.mean()

        """ Negative loss, because we want ascent but 'apply_gradients' applies descent """
        return -total_loss_actor, kl

    @partial(jax.jit, static_argnums=(0,))
    def _critic_loss(
            self,
            training: TrainState,
            obs: Annotated[ObsType, "n_rollout batch_size"],
            targets: ReturnsType,
            hyperparams: HyperParameters
    ) -> Float[Array, "1"]:
        """
        Calculates the critic loss.
        :param training: The critic TrainState object.
        :param obs: The obs in the trajectory batch.
        :param targets: The targets over the trajectory batch for training the critic.
        :param hyperparams: The HyperParameters object used for training.
        :return: The critic loss.
        """

        value_vmap = jax.vmap(jax.vmap(training.apply_fn, in_axes=(None, 0)), in_axes=(None, 0))
        value = value_vmap(training.params, obs)
        residuals = value - targets
        value_loss = jnp.mean(residuals ** 2)
        critic_total_loss = hyperparams.vf_coeff * value_loss

        return critic_total_loss

    @partial(jax.jit, static_argnums=(0,))
    def _actor_loss_input(self, update_runner: Runner, traj_batch: Transition) -> ActorLossInputType:
        """
        Prepares the input required by the actor loss function. For the PPO agent, this comprises:
        - the observations collected over the trajectory batch.
        - the actions collected over the trajectory batch.
        - the log-probabilities of the actions under the old policy.
        - the GAE advantages over the trajectory batch.
        - the training hyperparameters.
        The input is reshaped so that it is split into minibatches.
        :param update_runner: The Runner object used in training.
        :param traj_batch: The batch of trajectories.
        :return: A tuple of input to the actor loss function.
        """

        # Shuffle the trajectory batch to collect minibatches.
        # Using a fixed PRNG key here is poor practice, but it does not affect training, since every trajectory in the
        # batch is used in each epoch.
        minibatch_idx = jax.random.choice(
            jax.random.PRNGKey(1),
            jnp.arange(self.config.batch_size),
            replace=False,
            shape=(self.config.batch_size,)
        )

        traj_minibatch = jax.tree_map(lambda x: jnp.take(x, minibatch_idx, axis=0), traj_batch)
        traj_minibatch = jax.tree_map(lambda x: x.reshape(-1, self.config.minibatch_size, *x.shape[1:]), traj_minibatch)

        return (
            traj_minibatch.obs,
            traj_minibatch.action,
            traj_minibatch.log_prob,
            traj_minibatch.advantage,
            update_runner.hyperparams
        )

    @partial(jax.jit, static_argnums=(0,))
    def _critic_loss_input(self, update_runner: Runner, traj_batch: Transition) -> CriticLossInputType:
        """
        Prepares the input required by the critic loss function. For the PPO agent, this comprises:
        - the observations collected over the trajectory batch.
        - the targets (returns = GAE advantage + critic value) over the trajectory batch.
        - the training hyperparameters.
        The input is reshaped so that it is split into minibatches.
        :param update_runner: The Runner object used in training.
        :param traj_batch: The batch of trajectories.
        :return: A tuple of input to the critic loss function.
        """

        # Shuffle the trajectory batch to collect minibatches.
        # Using a fixed PRNG key here is poor practice, but it does not affect training, since every trajectory in the
        # batch is used in each epoch.
        minibatch_idx = jax.random.choice(
            jax.random.PRNGKey(1),
            jnp.arange(self.config.batch_size),
            replace=False,
            shape=(self.config.batch_size,)
        )

        traj_minibatch = jax.tree_map(lambda x: jnp.take(x, minibatch_idx, axis=0), traj_batch)
        traj_minibatch = jax.tree_map(lambda x: x.reshape(-1, self.config.minibatch_size, *x.shape[1:]), traj_minibatch)

        return (
            traj_minibatch.obs,
            traj_minibatch.advantage + traj_minibatch.value,
            update_runner.hyperparams
        )

_actor_loss(training, obs, actions, log_prob_old, advantage, hyperparams)

Calculates the actor loss using the clipped PPO surrogate objective. The GAE advantage is standardized before it enters the loss. :param training: The actor TrainState object. :param obs: The observations in the trajectory batch. :param actions: The actions in the trajectory batch. :param log_prob_old: Log-probabilities of the old policy collected over the trajectory batch. :param advantage: The GAE over the trajectory batch. :param hyperparams: The HyperParameters object used for training. :return: A tuple containing the actor loss and the KL divergence (used for the early-stopping criterion).

Source code in jaxagents\ippo.py (lines 1441-1490)
@partial(jax.jit, static_argnums=(0,))
def _actor_loss(
        self,
        training: TrainState,
        obs: Annotated[ObsType, "n_rollout batch_size"],
        actions: Annotated[ActionType, "batch_size"],
        log_prob_old: Float[Array, "n_rollout batch_size"],
        advantage: ReturnsType,
        hyperparams: HyperParameters
)-> Tuple[Float[Array, "1"], Float[Array, "1"]]:
    """
    Calculates the actor loss using the clipped PPO surrogate objective. The GAE advantage is standardized before
    it enters the loss.
    :param training: The actor TrainState object.
    :param obs: The observations in the trajectory batch.
    :param actions: The actions in the trajectory batch.
    :param log_prob_old: Log-probabilities of the old policy collected over the trajectory batch.
    :param advantage: The GAE over the trajectory batch.
    :param hyperparams: The HyperParameters object used for training.
    :return: A tuple containing the actor loss and the KL divergence (used for the early-stopping criterion).
    """

    """ Standardize GAE, greatly improves behaviour"""
    advantage = (advantage - advantage.mean(axis=0)) / (advantage.std(axis=0) + 1e-8)

    log_prob_vmap = jax.vmap(jax.vmap(self._log_prob, in_axes=(None, None, 0, 0)), in_axes=(None, None, 0, 0))
    log_prob = log_prob_vmap(training, training.params, obs, actions)
    log_policy_ratio = log_prob - log_prob_old
    policy_ratio = jnp.exp(log_policy_ratio)
    kl = jnp.sum(-log_policy_ratio)

    """
    Adopt simplified formulation of clipped policy ratio * advantage as explained in the note of:
    https://spinningup.openai.com/en/latest/algorithms/ppo.html#id2
    """
    clip = jnp.where(jnp.greater(advantage, 0), 1 + hyperparams.eps_clip, 1 - hyperparams.eps_clip)
    advantage_clip = advantage * clip

    """Actual clip calculation - not used but left here for comparison to simplified version"""
    # advantage_clip = jnp.clip(policy_ratio, 1 - hyperparams.eps_clip, 1 + hyperparams.eps_clip) * advantage

    loss_actor = jnp.minimum(policy_ratio * advantage, advantage_clip)

    entropy_vmap = jax.vmap(jax.vmap(self._entropy, in_axes=(None, 0)), in_axes=(None, 0))
    entropy = entropy_vmap(training, obs)

    total_loss_actor = loss_actor.mean() + hyperparams.ent_coeff * entropy.mean()

    """ Negative loss, because we want ascent but 'apply_gradients' applies descent """
    return -total_loss_actor, kl
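
As a companion to the listing above, here is a minimal, self-contained sketch of the clipped-surrogate mechanics on plain arrays (the helper name clipped_surrogate and its array arguments are hypothetical; the entropy bonus, the sign flip for gradient descent, and the vmapped policy evaluation are omitted):

import jax.numpy as jnp

def clipped_surrogate(log_prob, log_prob_old, advantage, eps_clip=0.2):
    # Standardize the advantage estimates over the batch axis.
    advantage = (advantage - advantage.mean(axis=0)) / (advantage.std(axis=0) + 1e-8)

    log_ratio = log_prob - log_prob_old
    ratio = jnp.exp(log_ratio)
    approx_kl = jnp.sum(-log_ratio)  # crude KL estimate, checked against a threshold for early stopping

    # Simplified clipping: the sign of the advantage decides which clip bound is active,
    # per the note in https://spinningup.openai.com/en/latest/algorithms/ppo.html#id2
    clip = jnp.where(advantage > 0, 1.0 + eps_clip, 1.0 - eps_clip)
    surrogate = jnp.minimum(ratio * advantage, clip * advantage)

    return surrogate.mean(), approx_kl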

_actor_loss_input(update_runner, traj_batch)

Prepares the input required by the actor loss function. For the PPO agent, this comprises: - the observations collected over the trajectory batch. - the actions collected over the trajectory batch. - the log-probabilities of the actions under the old policy. - the GAE advantages over the trajectory batch. - the training hyperparameters. The input is reshaped so that it is split into minibatches. :param update_runner: The Runner object used in training. :param traj_batch: The batch of trajectories. :return: A tuple of input to the actor loss function.

Source code in jaxagents\ippo.py (lines 1517-1551)
@partial(jax.jit, static_argnums=(0,))
def _actor_loss_input(self, update_runner: Runner, traj_batch: Transition) -> ActorLossInputType:
    """
    Prepares the input required by the actor loss function. For the PPO agent, this comprises:
    - the observations collected over the trajectory batch.
    - the actions collected over the trajectory batch.
    - the log-probabilities of the actions under the old policy.
    - the GAE advantages over the trajectory batch.
    - the training hyperparameters.
    The input is reshaped so that it is split into minibatches.
    :param update_runner: The Runner object used in training.
    :param traj_batch: The batch of trajectories.
    :return: A tuple of input to the actor loss function.
    """

    # Shuffle the trajectory batch to collect minibatches.
    # Using a fixed PRNG key here is poor practice, but it does not affect training, since every trajectory in the
    # batch is used in each epoch.
    minibatch_idx = jax.random.choice(
        jax.random.PRNGKey(1),
        jnp.arange(self.config.batch_size),
        replace=False,
        shape=(self.config.batch_size,)
    )

    traj_minibatch = jax.tree_map(lambda x: jnp.take(x, minibatch_idx, axis=0), traj_batch)
    traj_minibatch = jax.tree_map(lambda x: x.reshape(-1, self.config.minibatch_size, *x.shape[1:]), traj_minibatch)

    return (
        traj_minibatch.obs,
        traj_minibatch.action,
        traj_minibatch.log_prob,
        traj_minibatch.advantage,
        update_runner.hyperparams
    )
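
A self-contained sketch of the shuffle-and-reshape pattern used above, with a hypothetical make_minibatches helper that takes a fresh PRNG key instead of the fixed PRNGKey(1) mentioned in the comment (the trajectory batch is assumed to be a pytree of arrays with a leading batch axis):

import jax
import jax.numpy as jnp

def make_minibatches(rng, traj_batch, batch_size, minibatch_size):
    # Random permutation of the batch indices.
    idx = jax.random.permutation(rng, batch_size)
    # Shuffle every leaf of the pytree along the batch axis.
    shuffled = jax.tree_util.tree_map(lambda x: jnp.take(x, idx, axis=0), traj_batch)
    # (batch_size, ...) -> (n_minibatches, minibatch_size, ...)
    return jax.tree_util.tree_map(
        lambda x: x.reshape(-1, minibatch_size, *x.shape[1:]), shuffled
    )

# Example with a dict standing in for the Transition pytree:
batch = {"obs": jnp.ones((8, 4)), "action": jnp.zeros((8,), dtype=jnp.int32)}
minibatches = make_minibatches(jax.random.PRNGKey(0), batch, batch_size=8, minibatch_size=2)
# minibatches["obs"].shape == (4, 2, 4)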

_critic_loss(training, obs, targets, hyperparams)

Calculates the critic loss. :param training: The critic TrainState object. :param obs: The obs in the trajectory batch. :param targets: The targets over the trajectory batch for training the critic. :param hyperparams: The HyperParameters object used for training. :return: The critic loss.

Source code in jaxagents\ippo.py (lines 1492-1515)
@partial(jax.jit, static_argnums=(0,))
def _critic_loss(
        self,
        training: TrainState,
        obs: Annotated[ObsType, "n_rollout batch_size"],
        targets: ReturnsType,
        hyperparams: HyperParameters
) -> Float[Array, "1"]:
    """
    Calculates the critic loss.
    :param training: The critic TrainState object.
    :param obs: The obs in the trajectory batch.
    :param targets: The targets over the trajectory batch for training the critic.
    :param hyperparams: The HyperParameters object used for training.
    :return: The critic loss.
    """

    value_vmap = jax.vmap(jax.vmap(training.apply_fn, in_axes=(None, 0)), in_axes=(None, 0))
    value = value_vmap(training.params, obs)
    residuals = value - targets
    value_loss = jnp.mean(residuals ** 2)
    critic_total_loss = hyperparams.vf_coeff * value_loss

    return critic_total_loss
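
The value regression can be illustrated with a small standalone sketch (the function critic_mse_loss and the apply_fn argument are stand-ins for the critic's TrainState fields; shapes are illustrative):

import jax
import jax.numpy as jnp

def critic_mse_loss(params, apply_fn, obs, targets, vf_coeff=0.5):
    # obs has shape (n_rollout, minibatch_size, obs_dim); vmap over both leading axes.
    value_fn = jax.vmap(jax.vmap(apply_fn, in_axes=(None, 0)), in_axes=(None, 0))
    values = value_fn(params, obs)
    # Weighted mean squared error between the critic values and the GAE-based targets.
    return vf_coeff * jnp.mean((values - targets) ** 2)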

_critic_loss_input(update_runner, traj_batch)

Prepares the input required by the critic loss function. For the PPO agent, this comprises: - the observations collected over the trajectory batch. - the targets (returns = GAE advantage + critic value) over the trajectory batch. - the training hyperparameters. The input is reshaped so that it is split into minibatches. :param update_runner: The Runner object used in training. :param traj_batch: The batch of trajectories. :return: A tuple of input to the critic loss function.

Source code in jaxagents\ippo.py (lines 1553-1583)
@partial(jax.jit, static_argnums=(0,))
def _critic_loss_input(self, update_runner: Runner, traj_batch: Transition) -> CriticLossInputType:
    """
    Prepares the input required by the critic loss function. For the PPO agent, this comprises:
    - the observations collected over the trajectory batch.
    - the targets (returns = GAE advantage + critic value) over the trajectory batch.
    - the training hyperparameters.
    The input is reshaped so that it is split into minibatches.
    :param update_runner: The Runner object used in training.
    :param traj_batch: The batch of trajectories.
    :return: A tuple of input to the critic loss function.
    """

    # Shuffle the trajectory batch to collect minibatches.
    # Using a fixed PRNG key here is poor practice, but it does not affect training, since every trajectory in the
    # batch is used in each epoch.
    minibatch_idx = jax.random.choice(
        jax.random.PRNGKey(1),
        jnp.arange(self.config.batch_size),
        replace=False,
        shape=(self.config.batch_size,)
    )

    traj_minibatch = jax.tree_map(lambda x: jnp.take(x, minibatch_idx, axis=0), traj_batch)
    traj_minibatch = jax.tree_map(lambda x: x.reshape(-1, self.config.minibatch_size, *x.shape[1:]), traj_minibatch)

    return (
        traj_minibatch.obs,
        traj_minibatch.advantage + traj_minibatch.value,
        update_runner.hyperparams
    )
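
The targets handed to the critic are simply the GAE advantages plus the critic values that produced them, i.e. the lambda-returns; a toy example with made-up numbers:

import jax.numpy as jnp

advantages = jnp.array([0.4, -0.1, 0.2])   # GAE advantages A_t
values     = jnp.array([1.0,  0.9, 0.8])   # critic values V(s_t) stored in the trajectory
targets    = advantages + values           # lambda-returns used as regression targets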

_trajectory_advantages(advantage, traj)

Calculates the GAE per episode step over a batch of trajectories. :param advantage: The GAE advantages of the steps in the trajectory according to the critic (including that of the last state). At the beginning of the method, 'advantage' is the advantage of the state at the next step of the trajectory (not of the reverse iteration), and after the calculation it is the advantage of the examined step. :param traj: The trajectory batch. :return: The GAE advantage of the examined step, duplicated as carry and output for the scan.

Source code in jaxagents\ippo.py (lines 1425-1439)
@partial(jax.jit, static_argnums=(0,))
def _trajectory_advantages(self, advantage: Float[Array, "batch_size"], traj: Transition) -> Tuple[float, float]:
    """
    Calculates the GAE per episode step over a batch of trajectories.
    :param advantage: The GAE advantages of the steps in the trajectory according to the critic (including that of
    the last state). At the beginning of the method, 'advantage' is the advantage of the state at the next step of
    the trajectory (not of the reverse iteration), and after the calculation it is the advantage of the examined
    step.
    :param traj: The trajectory batch.
    :return: The GAE advantage of the examined step, duplicated as carry and output for the scan.
    """
    rewards, values, next_state_values, terminated, gamma, gae_lambda = traj
    d_t = rewards + (1 - terminated) * gamma * next_state_values - values  # Temporal difference residual at time t
    advantage = d_t + gamma * gae_lambda * (1 - terminated) * advantage
    return advantage, advantage
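
A minimal, self-contained sketch of how a per-step function like this is typically driven by jax.lax.scan in reverse over a single rollout (shapes and constants are illustrative; the library applies the same pattern to batched trajectories):

import jax
import jax.numpy as jnp

def gae_step(advantage, step):
    # One backward GAE step: delta_t + gamma * lambda * (1 - terminated) * A_{t+1}
    reward, value, next_value, terminated, gamma, lam = step
    delta = reward + (1.0 - terminated) * gamma * next_value - value
    advantage = delta + gamma * lam * (1.0 - terminated) * advantage
    return advantage, advantage

T = 5
rewards     = jnp.ones(T)
values      = jnp.linspace(1.0, 0.6, T)
next_values = jnp.linspace(0.9, 0.5, T)
terminated  = jnp.zeros(T).at[-1].set(1.0)
gamma       = jnp.full(T, 0.99)
lam         = jnp.full(T, 0.95)

# Scan backwards over the rollout, starting from a zero advantage beyond the last step.
_, advantages = jax.lax.scan(
    gae_step,
    jnp.zeros(()),
    (rewards, values, next_values, terminated, gamma, lam),
    reverse=True,
)
# advantages[t] is the GAE estimate for step t, in forward order.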

_trajectory_returns(value, traj)

Calculates the returns per episode step over a batch of trajectories. :param value: The values of the steps in the trajectory according to the critic (including that of the last state). At the beginning of the method, 'value' is the value of the state at the next step of the trajectory (not of the reverse iteration), and after the calculation it is the value of the examined step. :param traj: The trajectory batch. :return: The return of the examined step, duplicated as carry and output for the scan.

Source code in jaxagents\ippo.py (lines 1411-1423)
@partial(jax.jit, static_argnums=(0,))
def _trajectory_returns(self, value: Float[Array, "batch_size"], traj: Transition) -> Tuple[float, float]:
    """
    Calculates the returns per episode step over a batch of trajectories.
    :param value: The values of the steps in the trajectory according to the critic (including that of the last
    state). At the beginning of the method, 'value' is the value of the state at the next step of the trajectory
    (not of the reverse iteration), and after the calculation it is the value of the examined step.
    :param traj: The trajectory batch.
    :return: The return of the examined step, duplicated as carry and output for the scan.
    """
    rewards, discounts, next_state_values, gae_lambda = traj
    value = rewards + discounts * ((1 - gae_lambda) * next_state_values + gae_lambda * value)
    return value, value
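
The recursion in this scan body interpolates between a one-step bootstrapped target and the full Monte Carlo return, which a toy calculation makes concrete (all numbers are made up for illustration):

# One backward step of the lambda-return recursion used above:
#   G_t = r_t + discount_t * ((1 - lambda) * V(s_{t+1}) + lambda * G_{t+1})
def return_step(next_return, reward, discount, next_value, lam):
    return reward + discount * ((1.0 - lam) * next_value + lam * next_return)

r, d, v_next, g_next = 1.0, 0.99, 0.5, 2.0
one_step    = return_step(g_next, r, d, v_next, lam=0.0)  # 1.0 + 0.99 * 0.5 = 1.495 (bootstrapped target)
monte_carlo = return_step(g_next, r, d, v_next, lam=1.0)  # 1.0 + 0.99 * 2.0 = 2.98 (full return)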