Commit 46e8acd

Add Func: auto-adjust npugraph_batch_size to different models

1 parent be9e3e8 commit 46e8acd

File tree

2 files changed: +122 -1 lines changed
Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+import os
+
+import pytest
+import torch
+
+from vllm import LLM, SamplingParams
+from torch_npu.op_plugin.atb._atb_ops import _register_atb_extensions
+
+_register_atb_extensions()
+torch.cuda.CUDAGraph = torch.npu.NPUGraph
+
+MODELS = [
+    "Qwen/Qwen2.5-0.5B-Instruct",
+]
+
+TENSOR_PARALLELS = [2]
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("temperature", [0.0])
+@pytest.mark.parametrize("ignore_eos", [True])
+def test_models(model: str, tp_size: int, max_tokens: int, temperature: float, ignore_eos: bool) -> None:
+    # Create an LLM.
+    llm = LLM(
+        model=model,
+        tensor_parallel_size=tp_size,
+    )
+    # Prepare sampling_params.
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        temperature=temperature,
+        ignore_eos=ignore_eos,
+    )
+
+    # Generate texts from the prompts.
+    # The output is a list of RequestOutput objects.
+    outputs = llm.generate(prompts, sampling_params)
+    torch.npu.synchronize()
+    # The number of outputs should equal the number of prompts.
+    assert len(outputs) == len(prompts)

vllm_ascend/worker/model_runner_v1.py

Lines changed: 60 additions & 1 deletion
@@ -21,6 +21,7 @@
 import os
 import time
 import weakref
+import math
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
@@ -976,7 +977,9 @@ def capture_model(self) -> None:

         start_time = time.perf_counter()
         start_free_npu_memory = torch.npu.mem_get_info()[0]
-
+        # Since the vLLM npugraph_batch_sizes list can be too large,
+        # we need to adjust its length to a proper size.
+        self.verify_adjust_npugraph_batch_sizes()
         # Trigger NPU graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
@@ -994,3 +997,59 @@ def capture_model(self) -> None:
         # This usually takes 5~20 seconds.
         logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
                     elapsed_time, npu_graph_size / (1 << 30))
+
+    def verify_adjust_npugraph_batch_sizes(self) -> None:
+        # Currently, the max capture size supported by vllm-ascend is 1920.
+        max_capture_size = 1920
+        original_npugraph_batch_sizes = self.npugraph_batch_sizes
+        num_hidden_layers = self.vllm_config.model_config.hf_config.num_hidden_layers
+        max_support_len_npugraph = self.get_max_support_len(max_capture_size, num_hidden_layers)
+
+        if max_support_len_npugraph < len(original_npugraph_batch_sizes):
+            self.npugraph_batch_sizes = self.sample_from_list(max_support_len_npugraph)
+            logger.info("Model:%s-num_hidden_layers:%d will adjust npugraph_batch_sizes, pre-adjust-len: %s, post-adjust-len: %s",
+                        self.vllm_config.model_config.architectures[0],
+                        num_hidden_layers,
+                        len(original_npugraph_batch_sizes),
+                        len(self.npugraph_batch_sizes)
+                        )
+        else:
+            logger.info("Model:%s-num_hidden_layers:%d does not need to adjust npugraph_batch_sizes, list_len: %s",
+                        self.vllm_config.model_config.architectures[0],
+                        num_hidden_layers,
+                        len(original_npugraph_batch_sizes)
+                        )
+
+    def get_max_support_len(self, max_capture_size, num_hidden_layers) -> int:
+        parallel_type_cnt = 0
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        tp_size = self.vllm_config.parallel_config.tensor_parallel_size
+        if dp_size > 1:
+            parallel_type_cnt += 1
+        if tp_size > 1:
+            parallel_type_cnt += 1
+        max_support_len_npugraph = math.floor(max_capture_size / (num_hidden_layers + 1) / (parallel_type_cnt + 1))
+        logger.info("max_capture_size:%s, dp_size:%s, tp_size:%s, parallel_type_cnt:%s, max_support_len_npugraph:%s",
+                    max_capture_size,
+                    dp_size,
+                    tp_size,
+                    parallel_type_cnt,
+                    max_support_len_npugraph
+                    )
+
+        return max_support_len_npugraph
+
+    def sample_from_list(self, sample_len) -> list[int]:
+        # We use this function to sample a new list from the old list with a given length while maintaining uniformity, for example:
+        # original: [1 8 16 24 32 40 48 56 64]
+        # --> sample length = 3: [1 32 64]
+        # --> sample length = 5: [1 16 32 48 64]
+        original_len = len(self.npugraph_batch_sizes)
+        step = (original_len - 1) / (sample_len - 1)
+        indices = [round(i * step) for i in range(sample_len)]
+        # Align the first and last elements of the original list and the sub-list.
+        indices[0] = 0
+        indices[-1] = original_len - 1
+        # Sample the new list.
+        new_list = [self.npugraph_batch_sizes[i] for i in indices]
+        return new_list
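
For intuition about how the get_max_support_len heuristic above bounds the capture list, here is a minimal standalone sketch of the same arithmetic; the 24-layer model, the TP/DP values, and the max_support_len helper name are illustrative assumptions, not taken from the commit:

import math

def max_support_len(max_capture_size: int, num_hidden_layers: int,
                    dp_size: int, tp_size: int) -> int:
    # Count how many parallelism types are active (DP and/or TP).
    parallel_type_cnt = int(dp_size > 1) + int(tp_size > 1)
    # Divide the capture budget by (layers + 1) and again by
    # (active parallel types + 1), mirroring the heuristic in the diff.
    return math.floor(max_capture_size / (num_hidden_layers + 1) / (parallel_type_cnt + 1))

print(max_support_len(1920, 24, dp_size=1, tp_size=2))  # 1920 / 25 / 2 = 38.4 -> 38
print(max_support_len(1920, 24, dp_size=1, tp_size=1))  # 1920 / 25 / 1 = 76.8 -> 76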
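
Similarly, a self-contained sketch of the uniform down-sampling that sample_from_list performs, rewritten as a free function so the examples in its comment can be checked directly; sample_uniform is a hypothetical name, and the guards for degenerate lengths are additions for the sketch, not part of the diff:

def sample_uniform(values: list[int], sample_len: int) -> list[int]:
    # Pick sample_len entries spread evenly across values,
    # always keeping the first and last entries.
    original_len = len(values)
    if sample_len >= original_len:
        return list(values)  # nothing to shrink (guard added for the sketch)
    if sample_len == 1:
        return [values[0]]   # avoid a zero division in step (guard added for the sketch)
    step = (original_len - 1) / (sample_len - 1)
    indices = [round(i * step) for i in range(sample_len)]
    indices[0], indices[-1] = 0, original_len - 1
    return [values[i] for i in indices]

sizes = [1, 8, 16, 24, 32, 40, 48, 56, 64]
print(sample_uniform(sizes, 3))  # [1, 32, 64]
print(sample_uniform(sizes, 5))  # [1, 16, 32, 48, 64]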
