Commit f917d5e

Remove useless env (#4858)
Clean up unused environment variables. These envs are no longer used anywhere: `VLLM_ASCEND_TRACE_RECOMPILES`, `VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE`, `VLLM_ASCEND_MLA_PA`, `PHYSICAL_DEVICES`.

- vLLM version: v0.12.0
- vLLM main: vllm-project/vllm@ad32e3e

Signed-off-by: wangxiyuan <[email protected]>
1 parent 08441ba commit f917d5e
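
For context on what these entries look like, vllm_ascend/envs.py keeps its environment variables in a table of zero-argument lambdas (see the envs.py diff below). Here is a minimal sketch of that pattern; the `env_variables` name matches the style shown in the diff, but the module-level `__getattr__` wiring is an assumption based on the usual vLLM convention, not something this commit shows:

```python
import os
from typing import Any, Callable

# Each entry is a zero-arg lambda, so the environment variable is read
# at access time rather than once at import time.
env_variables: dict[str, Callable[[], Any]] = {
    "USE_OPTIMIZED_MODEL":
    lambda: bool(int(os.getenv("USE_OPTIMIZED_MODEL", "1"))),
    "MSMONITOR_USE_DAEMON":
    lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", "0"))),
}


def __getattr__(name: str) -> Any:
    # Assumed wiring: lets callers write `envs.USE_OPTIMIZED_MODEL`
    # against this module (PEP 562 module __getattr__).
    if name in env_variables:
        return env_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

Because each value is a lambda, removing an entry such as `VLLM_ASCEND_MLA_PA` simply drops it from the table without changing when the remaining variables are evaluated.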

4 files changed: +0 −28 lines

tests/e2e/multicard/test_offline_inference_distributed.py

Lines changed: 0 additions & 1 deletion
@@ -138,7 +138,6 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
 
 
 @pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
-@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
 def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
     prompts = [
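
For readers unfamiliar with the decorator being removed above: `unittest.mock.patch.dict` overrides `os.environ` only while the decorated test runs, which is why dropping the `VLLM_ASCEND_MLA_PA` line leaves the `HCCL_BUFFSIZE` override untouched. A self-contained sketch (the test name here is hypothetical):

```python
import os
from unittest.mock import patch


# patch.dict applies the mapping while the decorated function runs and
# restores os.environ afterwards.
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_env_is_patched():
    assert os.environ["HCCL_BUFFSIZE"] == "1024"


# Calling the test directly demonstrates the patch; outside the call,
# os.environ is back to its original state.
test_env_is_patched()
```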

tests/ut/kv_connector/test_mooncake_connector.py

Lines changed: 0 additions & 1 deletion
@@ -1055,7 +1055,6 @@ def register_memory(self, *args, **kwargs):
 
 class MockEnvsAscend:
     MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"
-    PHYSICAL_DEVICES = "10,11"
 
 
 def mock_get_tensor_model_parallel_rank():
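
`MockEnvsAscend` is a plain stub class whose attributes stand in for the real envs module, so once `PHYSICAL_DEVICES` is gone from `envs.py`, the stub drops the matching attribute. The diff does not show how the stub is injected into the code under test; below is a hedged sketch of one common wiring, using stand-in objects rather than the real connector:

```python
from types import SimpleNamespace
from unittest.mock import patch


class MockEnvsAscend:
    MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"


# Stand-in for a module under test that reads configuration from `envs`.
module_under_test = SimpleNamespace(
    envs=SimpleNamespace(MOONCAKE_CONNECTOR_PROTOCOL="real_protocol"))

with patch.object(module_under_test, "envs", MockEnvsAscend):
    # Inside the patch, code reading module_under_test.envs sees the stub.
    assert module_under_test.envs.MOONCAKE_CONNECTOR_PROTOCOL == "mock_protocol"

# Outside, the original object is restored.
assert module_under_test.envs.MOONCAKE_CONNECTOR_PROTOCOL == "real_protocol"
```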

tests/ut/kv_connector/test_mooncake_layerwise_connector.py

Lines changed: 0 additions & 9 deletions
@@ -893,12 +893,3 @@ def test_register_kv_caches_mla_case(self):
         worker.register_kv_caches(mla_caches)
         self.assertTrue(worker.use_mla)
         self.assertEqual(len(worker.block_len), 2)
-
-    def test_device_id_selection_with_physical_devices(self):
-        worker = MooncakeLayerwiseConnectorWorker(self.vllm_config,
-                                                  self.engine_id)
-        self.assertIsNotNone(worker.engine)
-
-
-if __name__ == '__main__':
-    unittest.main()

vllm_ascend/envs.py

Lines changed: 0 additions & 17 deletions
@@ -68,9 +68,6 @@
     # that the correct package is installed.
     "VLLM_VERSION":
     lambda: os.getenv("VLLM_VERSION", None),
-    # Whether to enable the trace recompiles from pytorch.
-    "VLLM_ASCEND_TRACE_RECOMPILES":
-    lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
     # Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and
     # GroupedMatmulFinalizeRouting operators are combined to implement EP.
     "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP":
@@ -86,16 +83,6 @@
     # value to False to disable the optimized model.
     "USE_OPTIMIZED_MODEL":
     lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
-    # The tolerance of the kv cache size, if the difference between the
-    # actual kv cache size and the cached kv cache size is less than this value,
-    # then the cached kv cache size will be used.
-    "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
-    lambda: int(
-        os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)),
-    # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
-    # and the mla_pa will be the default path of deepseek decode path.
-    "VLLM_ASCEND_MLA_PA":
-    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0)),
     # Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled.
     # this feature is supported in A2, and eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
@@ -130,10 +117,6 @@
     # this feature in eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
     lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
-    # Determine the number of physical devices in a non-full-use scenario
-    # caused by the initialization of the Mooncake connector.
-    "PHYSICAL_DEVICES":
-    lambda: os.getenv("PHYSICAL_DEVICES", None),
     # Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
     "MSMONITOR_USE_DAEMON":
     lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
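
A side note on the flag-reading idiom visible in the kept lines: `bool(int(os.getenv(name, '0')))` accepts only integer strings. A quick standalone check:

```python
import os

os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1"
flag = bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", "0")))
assert flag is True

# Values like "true" raise ValueError here: int("true") fails, so only
# "0"/"1" (or other integer strings) are valid for these switches.
```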
