Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 12 additions & 43 deletions docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ for i in {0..15}; do hccn_tool -i $i -ping -g address x.x.x.x;done
Mooncake is the serving platform for Kimi, a leading LLM service provided by Moonshot AI. First, we need to obtain the Mooncake project. Refer to the following command:

```shell
git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake
git clone https://github.com/kvcache-ai/Mooncake.git
```

Update and install Python.
Expand All @@ -67,22 +67,25 @@ apt-get update
apt-get install python3
```

Install the relevant dependencies. The installation of Go is not required.
Modify the Mooncake compilation option.

```shell
cd Mooncake
bash dependencies.sh -y
vi mooncake-common/common.cmake
# Find this line and set USE_ASCEND_DIRECT to ON.
option(USE_ASCEND_DIRECT "option for using ascend npu with adxl engine" ON)
```

Install MPI.

```shell
apt purge mpich libmpich-dev -y
apt purge openmpi-bin -y
apt purge openmpi-bin libopenmpi-dev -y
apt install mpich libmpich-dev -y
export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH
export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH
apt-get install mpich libmpich-dev -y
```

Install the relevant dependencies. The installation of Go is not required.

```shell
bash dependencies.sh -y
```

Compile and install
Expand All @@ -93,8 +96,6 @@ cd build
cmake ..
make -j
make install
cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cp mooncake-transfer-engine/src/libtransfer_engine.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
```

## Prefiller/Decoder Deployment
Expand All @@ -119,10 +120,6 @@ export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export ASCEND_AGGREGATE_ENABLE=1 # enable aggregated transmission
export ASCEND_TRANSPORT_PRINT=0 # print ascend transport logs
export ACL_OP_INIT_MODE=1 # acl op initialization mode to prevent device id acquisition failure
export ASCEND_A3_ENABLE=1 # enable hccs transmission for A3; set to 0 for A2
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH

vllm serve /model/Qwen3-235B-A22B-W8A8 \
Expand Down Expand Up @@ -178,10 +175,6 @@ export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export ASCEND_AGGREGATE_ENABLE=1
export ASCEND_TRANSPORT_PRINT=0
export ACL_OP_INIT_MODE=1
export ASCEND_A3_ENABLE=1
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH

vllm serve /model/Qwen3-235B-A22B-W8A8 \
Expand Down Expand Up @@ -237,10 +230,6 @@ export VLLM_USE_V1=1
export HCCL_BUFFSIZE=2048
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export ASCEND_AGGREGATE_ENABLE=1
export ASCEND_TRANSPORT_PRINT=0
export ACL_OP_INIT_MODE=1
export ASCEND_A3_ENABLE=1
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH

vllm serve /model/Qwen3-235B-A22B-W8A8 \
Expand Down Expand Up @@ -298,10 +287,6 @@ export VLLM_USE_V1=1
export HCCL_BUFFSIZE=2048
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export ASCEND_AGGREGATE_ENABLE=1
export ASCEND_TRANSPORT_PRINT=0
export ACL_OP_INIT_MODE=1
export ASCEND_A3_ENABLE=1
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH

vllm serve /model/Qwen3-235B-A22B-W8A8 \
Expand Down Expand Up @@ -366,10 +351,6 @@ export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export ASCEND_AGGREGATE_ENABLE=1
export ASCEND_TRANSPORT_PRINT=0
export ACL_OP_INIT_MODE=1
export ASCEND_A3_ENABLE=1
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH

vllm serve /model/Qwen3-235B-A22B-W8A8 \
Expand Down Expand Up @@ -425,10 +406,6 @@ export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export ASCEND_AGGREGATE_ENABLE=1
export ASCEND_TRANSPORT_PRINT=0
export ACL_OP_INIT_MODE=1
export ASCEND_A3_ENABLE=1
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH

vllm serve /model/Qwen3-235B-A22B-W8A8 \
Expand Down Expand Up @@ -484,10 +461,6 @@ export VLLM_USE_V1=1
export HCCL_BUFFSIZE=2048
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export ASCEND_AGGREGATE_ENABLE=1
export ASCEND_TRANSPORT_PRINT=0
export ACL_OP_INIT_MODE=1
export ASCEND_A3_ENABLE=1
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH

vllm serve /model/Qwen3-235B-A22B-W8A8 \
Expand Down Expand Up @@ -545,10 +518,6 @@ export VLLM_USE_V1=1
export HCCL_BUFFSIZE=2048
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export ASCEND_AGGREGATE_ENABLE=1
export ASCEND_TRANSPORT_PRINT=0
export ACL_OP_INIT_MODE=1
export ASCEND_A3_ENABLE=1
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH

vllm serve /model/Qwen3-235B-A22B-W8A8 \
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/distributed/mooncake_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -899,7 +899,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
self.device_id = device_ids[self.tp_rank] # type: ignore

if vllm_config.kv_transfer_config.get_from_extra_config(
'use_ascend_direct', False):
'use_ascend_direct', True):
hostname = self.side_channel_host
else:
hostname = f"{self.side_channel_host}:0:npu_{self.device_id}"
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/distributed/mooncake_layerwise_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
self.device_id = device_ids[self.tp_rank] # type: ignore

if vllm_config.kv_transfer_config.get_from_extra_config(
'use_ascend_direct', False):
'use_ascend_direct', True):
hostname = self.side_channel_host
else:
hostname = f"{self.side_channel_host}:0:npu_{self.device_id}"
Expand Down