diff --git a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md index 9e51b4dffe8..e92cb663c4f 100644 --- a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md +++ b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md @@ -57,7 +57,7 @@ for i in {0..15}; do hccn_tool -i $i -ping -g address x.x.x.x;done Mooncake is the serving platform for Kimi, a leading LLM service provided by Moonshot AI. First, we need to obtain the Mooncake project. Refer to the following command: ```shell -git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake +git clone https://github.com/kvcache-ai/Mooncake.git ``` Update and install Python. @@ -67,22 +67,25 @@ apt-get update apt-get install python3 ``` -Install the relevant dependencies. The installation of Go is not required. +Modify the Mooncake compilation option. ```shell cd Mooncake -bash dependencies.sh -y +vi mooncake-common/common.cmake +# Find this line and set USE_ASCEND_DIRECT to ON. +option(USE_ASCEND_DIRECT "option for using ascend npu with adxl engine" ON) ``` Install mpi ```shell -apt purge mpich libmpich-dev -y -apt purge openmpi-bin -y -apt purge openmpi-bin libopenmpi-dev -y -apt install mpich libmpich-dev -y -export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH -export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH +apt-get install mpich libmpich-dev -y +``` + +Install the relevant dependencies. The installation of Go is not required. + +```shell +bash dependencies.sh -y ``` Compile and install @@ -93,8 +96,6 @@ cd build cmake .. 
make -j make install -cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/ -cp mooncake-transfer-engine/src/libtransfer_engine.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/ ``` ## Prefiller/Decoder Deployment @@ -119,10 +120,6 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 # enable aggregated transmission -export ASCEND_TRANSPORT_PRINT=0 # print ascend transport logs -export ACL_OP_INIT_MODE=1 # acl op initialization mode to prevent device id acquisition failure -export ASCEND_A3_ENABLE=1 # enable hccs transmission for A3; set to 0 for A2 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -178,10 +175,6 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 -export ASCEND_TRANSPORT_PRINT=0 -export ACL_OP_INIT_MODE=1 -export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -237,10 +230,6 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 -export ASCEND_TRANSPORT_PRINT=0 -export ACL_OP_INIT_MODE=1 -export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -298,10 +287,6 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 -export ASCEND_TRANSPORT_PRINT=0 -export ACL_OP_INIT_MODE=1 -export ASCEND_A3_ENABLE=1 export 
LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -366,10 +351,6 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 -export ASCEND_TRANSPORT_PRINT=0 -export ACL_OP_INIT_MODE=1 -export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -425,10 +406,6 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 -export ASCEND_TRANSPORT_PRINT=0 -export ACL_OP_INIT_MODE=1 -export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -484,10 +461,6 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 -export ASCEND_TRANSPORT_PRINT=0 -export ACL_OP_INIT_MODE=1 -export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -545,10 +518,6 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 -export ASCEND_TRANSPORT_PRINT=0 -export ACL_OP_INIT_MODE=1 -export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py index 0118dcd444f..ca36581a167 100644 --- a/vllm_ascend/distributed/mooncake_connector.py +++ b/vllm_ascend/distributed/mooncake_connector.py @@ -899,7 +899,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: 
str): self.device_id = device_ids[self.tp_rank] # type: ignore if vllm_config.kv_transfer_config.get_from_extra_config( - 'use_ascend_direct', False): + 'use_ascend_direct', True): hostname = self.side_channel_host else: hostname = f"{self.side_channel_host}:0:npu_{self.device_id}" diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py index 7f7c4642bd7..e500f533f4e 100644 --- a/vllm_ascend/distributed/mooncake_layerwise_connector.py +++ b/vllm_ascend/distributed/mooncake_layerwise_connector.py @@ -656,7 +656,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.device_id = device_ids[self.tp_rank] # type: ignore if vllm_config.kv_transfer_config.get_from_extra_config( - 'use_ascend_direct', False): + 'use_ascend_direct', True): hostname = self.side_channel_host else: hostname = f"{self.side_channel_host}:0:npu_{self.device_id}"