Source code for omnisafe.algorithms.on_policy.base.ppo
# Copyright 2023 OmniSafe Team. All Rights Reserved.## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.# =============================================================================="""Implementation of the PPO algorithm."""from__future__importannotationsimporttorchfromomnisafe.algorithmsimportregistryfromomnisafe.algorithms.on_policy.base.policy_gradientimportPolicyGradient
def _loss_pi(
    self,
    obs: torch.Tensor,
    act: torch.Tensor,
    logp: torch.Tensor,
    adv: torch.Tensor,
) -> torch.Tensor:
    r"""Compute the pi/actor loss.

    Proximal Policy Optimization maximises the clipped surrogate objective

    .. math::

        L^{CLIP} = \underset{s_t \sim \rho_{\theta}}{\mathbb{E}} \left[
            \min \left(
                r_t A^{R}_{\pi_{\theta}} (s_t, a_t) ,
                \text{clip} (r_t, 1 - \epsilon, 1 + \epsilon)
                A^{R}_{\pi_{\theta}} (s_t, a_t)
            \right)
        \right]

    where :math:`r_t = \frac{\pi_{\theta}^{'} (a_t|s_t)}{\pi_{\theta} (a_t|s_t)}`,
    :math:`\epsilon` is the clip parameter (``algo_cfgs.clip``), and
    :math:`A^{R}_{\pi_{\theta}} (s_t, a_t)` is the advantage.

    The value returned here is the quantity to *minimise*: the negated
    objective minus an entropy bonus weighted by ``algo_cfgs.entropy_coef``,
    i.e. ``-L^{CLIP} - entropy_coef * mean_entropy``.

    As a side effect, ``Train/Entropy``, ``Train/PolicyRatio``,
    ``Train/PolicyStd`` and ``Loss/Loss_pi`` are stored in the logger.

    Args:
        obs (torch.Tensor): The ``observation`` sampled from buffer.
        act (torch.Tensor): The ``action`` sampled from buffer.
        logp (torch.Tensor): The ``log probability`` of action sampled from buffer.
        adv (torch.Tensor): The ``advantage`` processed. ``reward_advantage`` here.

    Returns:
        The loss of pi/actor.
    """
    actor = self._actor_critic.actor
    algo_cfgs = self._cfgs.algo_cfgs

    # Keep the forward pass ahead of ``log_prob``/``std``: presumably both
    # are evaluated against the distribution produced by this call
    # (NOTE(review): confirm against the actor implementation).
    distribution = actor(obs)
    logp_ = actor.log_prob(act)
    std = actor.std

    # Probability ratio between the current policy and the policy that
    # collected the data, and its clipped counterpart.
    ratio = torch.exp(logp_ - logp)
    ratio_clipped = torch.clamp(ratio, 1 - algo_cfgs.clip, 1 + algo_cfgs.clip)

    # The mean entropy feeds both the loss and the log record, so it is
    # computed once and reused.
    entropy = distribution.entropy().mean()

    surrogate = torch.min(ratio * adv, ratio_clipped * adv).mean()
    loss = -surrogate - algo_cfgs.entropy_coef * entropy

    # Useful extra info for monitoring training.
    self._logger.store(
        {
            'Train/Entropy': entropy.item(),
            'Train/PolicyRatio': ratio,
            'Train/PolicyStd': std,
            # ``loss`` is already a 0-dim tensor; no further reduction needed.
            'Loss/Loss_pi': loss.item(),
        },
    )
    return loss