@@ -19,7 +19,7 @@
 
 import torch
 import torch_npu
-from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.config import get_current_vllm_config
 from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group,
                               tensor_model_parallel_all_reduce)
 from vllm.forward_context import get_forward_context
@@ -51,20 +51,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
     def __init__(self, moe: FusedMoEConfig = None):
 
         super().__init__(moe=moe)
-
-        # NOTE: Currently, this self.use_aclgraph is only used in
-        # UnquantizedFusedMoEMethod.forward_oot to decide whether to use in
-        # ops/fused_moe.py:568 to circumvent torch.randint_like not supported issue.
-        # Once torch.randint_like is supported or removed, this flag can be removed.
-        vllm_config = get_current_vllm_config()
-        ascend_config = get_ascend_config()
         self.dynamic_eplb = get_ascend_config().dynamic_eplb
-        if ascend_config.torchair_graph_config.enabled:
-            self.use_aclgraph = False
-        else:
-            self.use_aclgraph = (vllm_config.compilation_config.level
-                                 == CompilationLevel.PIECEWISE and
-                                 not vllm_config.model_config.enforce_eager)
         self.transpose = True
 
     def process_weights_after_loading(self, layer):
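For context, the deleted flag gated the load-balance randomization on whether vLLM was running in ACL graph (piecewise-compiled) mode, and existed only to dodge a then-unsupported torch.randint_like. A minimal standalone sketch of that check follows; the config classes here are simplified stand-ins, not the real vllm.config types:

# Standalone sketch of the removed use_aclgraph detection; FakeConfig and
# detect_use_aclgraph are hypothetical stand-ins for vLLM's config plumbing.
from dataclasses import dataclass
from enum import IntEnum


class CompilationLevel(IntEnum):
    # Simplified stand-in for vllm.config.CompilationLevel;
    # only PIECEWISE matters for this check.
    NO_COMPILATION = 0
    PIECEWISE = 3


@dataclass
class FakeConfig:
    compilation_level: CompilationLevel
    enforce_eager: bool
    torchair_graph_enabled: bool


def detect_use_aclgraph(cfg: FakeConfig) -> bool:
    # TorchAir graph mode and ACL graph mode are mutually exclusive.
    if cfg.torchair_graph_enabled:
        return False
    # ACL graph capture applies only to piecewise compilation
    # when eager execution is not forced.
    return (cfg.compilation_level == CompilationLevel.PIECEWISE
            and not cfg.enforce_eager)


assert detect_use_aclgraph(
    FakeConfig(CompilationLevel.PIECEWISE, False, False))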
@@ -133,7 +120,7 @@ def apply(self,
         # this is a naive implementation for experts load balance so as
         # to avoid accumulating too much tokens on a single rank.
         # currently it is only activated when doing profile runs.
-        if enable_force_load_balance and not self.use_aclgraph:
+        if enable_force_load_balance:
             topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
 
         moe_comm_method = get_forward_context().moe_comm_method
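With the flag gone, the force load-balance path runs whenever enable_force_load_balance is set, regardless of graph mode. The trick it applies: during profile runs, every token's routed expert IDs are overwritten with uniformly random IDs so no single rank accumulates a disproportionate share of tokens. A quick illustration of the effect (shapes and values below are made up for the example):

import torch

# Hypothetical sizes: 8 tokens, top-2 routing, 16 experts in total.
num_tokens, top_k, global_num_experts = 8, 2, 16

# A pathologically skewed routing: every token picks experts 0 and 1,
# so the ranks hosting those two experts would absorb all the work.
topk_ids = torch.tensor([[0, 1]] * num_tokens, dtype=torch.int32)

# The force load-balance step: ignore the router and draw expert IDs
# uniformly from [0, global_num_experts), preserving shape and dtype.
balanced_ids = torch.randint_like(topk_ids, 0, global_num_experts)

print(balanced_ids.unique(return_counts=True))  # roughly uniform spread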