@@ -723,6 +723,7 @@ def save_to_aux_losses_tracker(
     num_layers: int,
     reduce_group: torch.distributed.ProcessGroup = None,
     avg_group: torch.distributed.ProcessGroup = None,
+    reduce_group_has_dp: bool = False,
 ):
     """Save the auxiliary loss for logging.
     Args:
@@ -731,7 +732,10 @@ def save_to_aux_losses_tracker(
         layer_number (int): Layer index of the loss.
         num_layers (int): The number of total layers.
         reduce_group (torch.distributed.ProcessGroup): The group for reducing the loss.
-        mean_group (torch.distributed.ProcessGroup): The group for averaging the loss.
+        avg_group (torch.distributed.ProcessGroup): The group for averaging the loss.
+        reduce_group_has_dp (bool): Whether the reduce group already contains the data
+            parallel ranks. Set this to True in that case so that aux loss tracking does
+            not perform a second reduction across data parallel ranks.
     """
     # Skip aux loss logging if layer_number is None.
     if layer_number is None:
@@ -744,6 +748,7 @@ def save_to_aux_losses_tracker(
     tracker[name]["values"][layer_number - 1] += loss.detach()  # Aggregate the loss for the layer.
     tracker[name]["reduce_group"] = reduce_group
     tracker[name]["avg_group"] = avg_group
+    tracker[name]["reduce_group_has_dp"] = reduce_group_has_dp


 def clear_aux_losses_tracker():
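For callers of `save_to_aux_losses_tracker`, the new flag only matters when the supplied `reduce_group` already spans data parallel ranks. Below is a minimal usage sketch, assuming the function lives in Megatron-LM's `moe_utils` module; the variable names, group choices, and loss name are illustrative, not part of this patch.

# Hedged usage sketch -- aux_loss, layer_number, num_layers, and the group choices
# below are assumptions for illustration only.
import torch

from megatron.core import parallel_state
from megatron.core.transformer.moe.moe_utils import save_to_aux_losses_tracker


def log_router_aux_loss(aux_loss: torch.Tensor, layer_number: int, num_layers: int):
    # Typical case: the loss is reduced only over tensor parallel ranks, so the tracker
    # still has to average it across data parallel ranks; keep the default
    # reduce_group_has_dp=False.
    save_to_aux_losses_tracker(
        "load_balancing_loss",
        aux_loss,
        layer_number,
        num_layers,
        reduce_group=parallel_state.get_tensor_model_parallel_group(),
    )


def log_aux_loss_reduced_over_dp(aux_loss, layer_number, num_layers, group_with_dp):
    # Hypothetical case: group_with_dp already contains the data parallel ranks, so the
    # tracker must not average over data parallel a second time.
    save_to_aux_losses_tracker(
        "load_balancing_loss",
        aux_loss,
        layer_number,
        num_layers,
        reduce_group=group_with_dp,
        reduce_group_has_dp=True,
    )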
@@ -768,16 +773,18 @@ def reduce_aux_losses_tracker_across_ranks(track_names: Optional[List[str]] = No
         # Reduce aux losses across ranks.
         if tracker[name].get('reduce_group') is not None:
             torch.distributed.all_reduce(values, group=tracker[name].get('reduce_group'))
+        # The aux loss also needs to be reduced across data parallel ranks. When the
+        # reduce_group does not already include them, do it manually here.
+        if not tracker[name].get('reduce_group_has_dp', False):
+            torch.distributed.all_reduce(
+                values,
+                group=parallel_state.get_data_parallel_group(with_context_parallel=False),
+                op=torch.distributed.ReduceOp.AVG,
+            )
         if tracker[name].get('avg_group') is not None:
             torch.distributed.all_reduce(
                 values, group=tracker[name]['avg_group'], op=torch.distributed.ReduceOp.AVG
             )
-        # This ensures proper loss averaging across all ranks including CP ranks
-        torch.distributed.all_reduce(
-            values,
-            group=parallel_state.get_data_parallel_group(with_context_parallel=True),
-            op=torch.distributed.ReduceOp.AVG,
-        )


 def track_moe_metrics(
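Read as a whole, the reduction for each tracked loss now proceeds in the order sketched below. This is a simplified, standalone rendering of the logic in the hunk above, not the actual library function; it is only meant to show when the manual data parallel averaging kicks in.

# Simplified sketch of the per-loss reduction order after this change (illustrative).
import torch

from megatron.core import parallel_state


def reduce_one_tracked_loss(entry: dict) -> torch.Tensor:
    values = entry["values"]
    # 1) Sum the per-layer losses over the caller-supplied reduce group (e.g. TP/EP ranks).
    if entry.get("reduce_group") is not None:
        torch.distributed.all_reduce(values, group=entry["reduce_group"])
    # 2) Average across data parallel ranks, but only if the reduce group above did not
    #    already include them; otherwise the loss would be averaged twice.
    if not entry.get("reduce_group_has_dp", False):
        torch.distributed.all_reduce(
            values,
            group=parallel_state.get_data_parallel_group(with_context_parallel=False),
            op=torch.distributed.ReduceOp.AVG,
        )
    # 3) Finally, average over any extra caller-supplied averaging group.
    if entry.get("avg_group") is not None:
        torch.distributed.all_reduce(
            values, group=entry["avg_group"], op=torch.distributed.ReduceOp.AVG
        )
    return values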
@@ -805,6 +812,7 @@ def track_moe_metrics(
                     tracker[key]["values"] = torch.zeros(num_layers, device="cuda")
                     tracker[key]["reduce_group"] = None
                     tracker[key]["avg_group"] = None
+                    tracker[key]["reduce_group_has_dp"] = False
     reduce_aux_losses_tracker_across_ranks(track_names)

     # Get number of MoE layers
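After this change, a force-initialized tracker entry carries the new field with a conservative default. A minimal sketch of the resulting layout follows; the values are placeholders and the tensor is shown on CPU for simplicity, whereas the real tracker allocates it on CUDA.

# Illustrative layout of one force-initialized tracker entry; values are placeholders.
import torch

num_layers = 4  # placeholder layer count
tracker_entry = {
    "values": torch.zeros(num_layers),  # per-layer accumulated aux loss
    "reduce_group": None,               # set later by save_to_aux_losses_tracker
    "avg_group": None,                  # optional extra averaging group
    "reduce_group_has_dp": False,       # default: the tracker still averages over data parallel itself
}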