
Commit eb48cda

Merge pull request #51 from Chen001117/dev
Add `with torch.no_grad()` while calculating the reward function
2 parents 0757ab7 + b67a31b commit eb48cda

1 file changed: +7, −6 lines


openrl/envs/nlp/rewards/kl_penalty.py

Lines changed: 7 additions & 6 deletions
@@ -66,12 +66,13 @@ def __call__(
             self._ref_net, input_ids, past_model_kwargs
         )
 
-        output = self._ref_net(output_hidden_states=True, **model_inputs)
-        output["past_key_values"] = None
-        next_token_logits = output.logits[:, -1, :]
-        dist = self._action_dist.proba_distribution(action_logits=next_token_logits)
-        action_input = actions.to(next_token_logits.device)
-        ref_log_prob = dist.log_prob(action_input)
+        with torch.no_grad():
+            output = self._ref_net(output_hidden_states=True, **model_inputs)
+            output["past_key_values"] = None
+            next_token_logits = output.logits[:, -1, :]
+            dist = self._action_dist.proba_distribution(action_logits=next_token_logits)
+            action_input = actions.to(next_token_logits.device)
+            ref_log_prob = dist.log_prob(action_input)
 
         ref_log_prob = ref_log_prob.reshape(action_log_probs.shape)
         kl_div = action_log_probs.copy() - ref_log_prob.detach().cpu().numpy()
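For context, `torch.no_grad()` disables gradient tracking inside its block, so the reference model's forward pass builds no autograd graph. That is the right behavior here: the KL penalty is a reward signal, a constant with respect to the policy's parameters, so retaining activations for backpropagation would only waste memory. Below is a minimal, self-contained sketch of the same pattern; the `ref_net`, `hidden`, and `actions` names are illustrative stand-ins, not the repository's API.

import torch

# Illustrative stand-in for a frozen reference model: hidden states -> vocab logits.
ref_net = torch.nn.Linear(16, 100)

hidden = torch.randn(4, 16)             # batch of last-layer hidden states
actions = torch.randint(0, 100, (4,))   # token ids sampled by the policy
action_log_probs = torch.randn(4)       # policy log-probs for those tokens

# Without no_grad, this forward pass would record an autograd graph for
# ref_net even though the reward never backpropagates through it.
with torch.no_grad():
    ref_logits = ref_net(hidden)
    ref_log_prob = torch.distributions.Categorical(logits=ref_logits).log_prob(actions)

# Per-token KL penalty term, matching the shape of the policy log-probs.
kl_div = action_log_probs.detach() - ref_log_prob

Calling `.detach()` on the result afterwards (as the original code did) only cuts the graph after it has been built; wrapping the reference-model block in `no_grad` avoids building it at all, which is the point of this commit.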
