vllm-project · wangxiyuan · Nov 4, 2025 · Oct 25, 2025 · Oct 27, 2025 · Nov 4, 2025
diff --git a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md
@@ -57,7 +57,7 @@ for i in {0..15}; do hccn_tool -i $i -ping -g address x.x.x.x;done
 Mooncake is the serving platform for Kimi, a leading LLM service provided by Moonshot AI. First, we need to obtain the Mooncake project. Refer to the following command:
 
 ```shell
-git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake
+git clone https://github.com/kvcache-ai/Mooncake.git
 ```
 
 Update and install Python.
@@ -67,22 +67,25 @@ apt-get update
 apt-get install python3
 ```
 
-Install the relevant dependencies. The installation of Go is not required.
+Modify the Mooncake compilation option:
 
 ```shell
 cd Mooncake
-bash dependencies.sh -y
+vi mooncake-common/common.cmake
+# Find the following line and set USE_ASCEND_DIRECT to ON:
+option(USE_ASCEND_DIRECT "option for using ascend npu with adxl engine" ON)
 ```
 
 Install mpi
 
 ```shell
-apt purge mpich libmpich-dev -y
-apt purge openmpi-bin -y
-apt purge openmpi-bin libopenmpi-dev -y
-apt install mpich libmpich-dev -y
-export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH
-export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH
+apt-get install mpich libmpich-dev -y
+```
+
+Install the relevant dependencies. The installation of Go is not required.
+
+```shell
+bash dependencies.sh -y
 ```
 
 Compile and install
@@ -93,8 +96,6 @@ cd build
 cmake ..
 make -j
 make install
-cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
-cp mooncake-transfer-engine/src/libtransfer_engine.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
 ```
 
 ## Prefiller/Decoder Deployment
@@ -119,10 +120,6 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export ASCEND_AGGREGATE_ENABLE=1  # enable aggregated transmission
-export ASCEND_TRANSPORT_PRINT=0  # print ascend transport logs
-export ACL_OP_INIT_MODE=1  # acl op initialization mode to prevent device id acquisition failure
-export ASCEND_A3_ENABLE=1  # enable hccs transmission for A3; set to 0 for A2
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 
 vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -178,10 +175,6 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export ASCEND_AGGREGATE_ENABLE=1
-export ASCEND_TRANSPORT_PRINT=0
-export ACL_OP_INIT_MODE=1
-export ASCEND_A3_ENABLE=1
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 
 vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -237,10 +230,6 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=2048
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export ASCEND_AGGREGATE_ENABLE=1
-export ASCEND_TRANSPORT_PRINT=0
-export ACL_OP_INIT_MODE=1
-export ASCEND_A3_ENABLE=1
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 
 vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -298,10 +287,6 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=2048
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export ASCEND_AGGREGATE_ENABLE=1
-export ASCEND_TRANSPORT_PRINT=0
-export ACL_OP_INIT_MODE=1
-export ASCEND_A3_ENABLE=1
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 
 vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -366,10 +351,6 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export ASCEND_AGGREGATE_ENABLE=1
-export ASCEND_TRANSPORT_PRINT=0
-export ACL_OP_INIT_MODE=1
-export ASCEND_A3_ENABLE=1
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 
 vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -425,10 +406,6 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export ASCEND_AGGREGATE_ENABLE=1
-export ASCEND_TRANSPORT_PRINT=0
-export ACL_OP_INIT_MODE=1
-export ASCEND_A3_ENABLE=1
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 
 vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -484,10 +461,6 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=2048
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export ASCEND_AGGREGATE_ENABLE=1
-export ASCEND_TRANSPORT_PRINT=0
-export ACL_OP_INIT_MODE=1
-export ASCEND_A3_ENABLE=1
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 
 vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -545,10 +518,6 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=2048
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export ASCEND_AGGREGATE_ENABLE=1
-export ASCEND_TRANSPORT_PRINT=0
-export ACL_OP_INIT_MODE=1
-export ASCEND_A3_ENABLE=1
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 
 vllm serve /model/Qwen3-235B-A22B-W8A8 \

diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py
@@ -899,7 +899,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         self.device_id = device_ids[self.tp_rank]  # type: ignore
 
         if vllm_config.kv_transfer_config.get_from_extra_config(
-                'use_ascend_direct', False):
+                'use_ascend_direct', True):
             hostname = self.side_channel_host
         else:
             hostname = f"{self.side_channel_host}:0:npu_{self.device_id}"

diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py
@@ -656,7 +656,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         self.device_id = device_ids[self.tp_rank]  # type: ignore
 
         if vllm_config.kv_transfer_config.get_from_extra_config(
-                'use_ascend_direct', False):
+                'use_ascend_direct', True):
             hostname = self.side_channel_host
         else:
             hostname = f"{self.side_channel_host}:0:npu_{self.device_id}"