Commit 7f5f6b6

Merge branch 'main' into main
2 parents 3f0ccc4 + 3724107

90 files changed, +3395 −713 lines

.buildkite/check-wheel-size.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -5,11 +5,11 @@
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
 
 
 def print_top_10_largest_files(zip_file):
```
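The check keeps its existing pattern of reading a limit from the environment with a fallback (`os.environ.get("VLLM_MAX_SIZE_MB", 450)`), and `auto_tune.sh` below adopts the same idea via bash's `${VAR:-default}` expansion. For reference, a minimal sketch of that read-env-or-default pattern in C++ (illustrative only; `env_or_default` is not a helper from this commit):

```cpp
#include <cstdlib>
#include <string>

// Read an integer from the environment, falling back to a default when the
// variable is unset. Mirrors os.environ.get("VLLM_MAX_SIZE_MB", 450).
int env_or_default(const char* name, int fallback) {
  const char* value = std::getenv(name);  // nullptr when the variable is unset
  return value ? std::stoi(value) : fallback;  // std::stoi throws on non-numeric input
}

// Usage: int max_wheel_mb = env_or_default("VLLM_MAX_SIZE_MB", 450);
```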

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 5 additions & 4 deletions
```diff
@@ -30,10 +30,11 @@ docker run \
   bash -c '
   set -e
   echo $ZE_AFFINITY_MASK
-  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
-  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+  VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   cd tests
   pytest -v -s v1/core
   pytest -v -s v1/engine
```

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -18,6 +18,7 @@ Easy, fast, and cheap LLM serving for everyone
 
 *Latest News* 🔥
 
+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
 - [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
 - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
```

benchmarks/auto_tune/README.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -31,6 +31,12 @@ cd vllm
 
 You must set the following variables at the top of the script before execution.
 
+Note: You can also override the default values below via environment variables when running the script.
+
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+
 | Variable | Description | Example Value |
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
```

benchmarks/auto_tune/auto_tune.sh

Lines changed: 31 additions & 15 deletions
```diff
@@ -5,25 +5,41 @@
 
 TAG=$(date +"%Y_%m_%d_%H_%M")
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
-TP=1
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM=${SYSTEM:-"TPU"}
+TP=${TP:-1}
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
 
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 PROFILE_PATH="$LOG_FOLDER/profile"
 
-echo "result file: $RESULT"
-echo "model: $MODEL"
+echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNEPARAMETERS ===================="
 
 rm -rf $LOG_FOLDER
 rm -rf $PROFILE_PATH
@@ -213,7 +229,7 @@ run_benchmark() {
 
   pkill -if vllm
   sleep 10
-  printf '=%.0s' $(seq 1 20)
+  echo "===================="
   return 0
 }
 
```

cmake/cpu_extension.cmake

Lines changed: 2 additions & 1 deletion
```diff
@@ -88,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)
 
 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
     message(STATUS "Apple Silicon Detected")
+    set(APPLE_SILICON_FOUND TRUE)
     set(ENABLE_NUMA OFF)
     check_sysctl(hw.optional.neon ASIMD_FOUND)
     check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@@ -189,7 +190,7 @@ else()
     set(USE_ACL OFF)
 endif()
 
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
     FetchContent_Declare(
         oneDNN
         GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
```

cmake/external_projects/vllm_flash_attn.cmake

Lines changed: 1 addition & 1 deletion
```diff
@@ -38,7 +38,7 @@ else()
     FetchContent_Declare(
             vllm-flash-attn
             GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-            GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+            GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
            GIT_PROGRESS TRUE
            # Don't share the vllm-flash-attn build between build types
            BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
```

csrc/cpu/dnnl_helper.cpp

Lines changed: 177 additions & 0 deletions
```diff
@@ -22,6 +22,23 @@ void release_dnnl_matmul_handler(int64_t handler) {
   delete ptr;
 }
 
+DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
+  this->realloc(allocation_unit * 128);
+}
+
+void DNNLScratchPadManager::realloc(size_t new_size) {
+  new_size = round(new_size);
+  if (new_size > size_) {
+    ptr_ = std::aligned_alloc(64, new_size);
+    size_ = new_size;
+  }
+}
+
+DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
+  static DNNLScratchPadManager manager;
+  return &manager;
+}
+
 template <typename KT, typename VT>
 class DNNLPrimitiveCache {
  public:
```
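The new `DNNLScratchPadManager` backs oneDNN's user-managed scratchpad mode: one process-wide, grow-only, 64-byte-aligned buffer is shared by every matmul primitive instead of letting each primitive allocate its own workspace. A minimal, self-contained sketch of that oneDNN flow (illustrative; `run_matmul_with_user_scratchpad` is not from the commit, and error handling is omitted):

```cpp
#include <cstdlib>
#include <unordered_map>
#include "oneapi/dnnl/dnnl.hpp"

void run_matmul_with_user_scratchpad(dnnl::engine& eng, dnnl::stream& strm,
                                     dnnl::memory& a, dnnl::memory& b,
                                     dnnl::memory& c) {
  // 1. Ask oneDNN to let the caller own the scratchpad.
  dnnl::primitive_attr attr;
  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
  dnnl::matmul::primitive_desc pd(eng, a.get_desc(), b.get_desc(),
                                  c.get_desc(), attr);

  // 2. Query the workspace size and allocate it (rounded up so the size is a
  //    multiple of the alignment). The commit's manager keeps one grow-only
  //    buffer and reuses it across all cached primitives.
  size_t size = (pd.scratchpad_desc().get_size() + 63) & ~size_t(63);
  void* buf = std::aligned_alloc(64, size);
  dnnl::memory scratchpad(pd.scratchpad_desc(), eng, buf);

  // 3. Hand the scratchpad to execute() alongside the regular arguments.
  dnnl::matmul(pd).execute(strm, {{DNNL_ARG_SRC, a},
                                  {DNNL_ARG_WEIGHTS, b},
                                  {DNNL_ARG_DST, c},
                                  {DNNL_ARG_SCRATCHPAD, scratchpad}});
  strm.wait();
  std::free(buf);
}
```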
```diff
@@ -166,6 +183,23 @@ struct hash<W8A8MatMulPrimitiveHandler::MSizeCacheKey> {
            hash<int>()(static_cast<int>(val.bias_type));
   }
 };
+
+template <>
+struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
+  size_t operator()(
+      const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
+    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
+  }
+};
+
+template <>
+struct hash<MatMulPrimitiveHandler::MSizeCacheKey> {
+  size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const {
+    return hash<dnnl_dim_t>()(val.a_m_size) ^
+           hash<dnnl_dim_t>()(val.a_m_stride) ^ hash<bool>()(val.use_bias) ^
+           hash<int>()(static_cast<int>(val.bias_type));
+  }
+};
 }  // namespace std
 
 bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
@@ -181,6 +215,17 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
          l.bias_type == r.bias_type;
 }
 
+bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
+                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
+  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
+}
+
+bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
+                const MatMulPrimitiveHandler::MSizeCacheKey& r) {
+  return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride &&
+         l.use_bias == r.use_bias && l.bias_type == r.bias_type;
+}
+
 static std::shared_ptr<W8A8MatMulPrimitiveHandler::MSizeCache>
 get_w8a8_class_primitive_cache(
     const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
```
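Both new key structs get the two things an `std::unordered_map`-based cache needs: a `std::hash` specialization and `operator==`. The hashes above combine fields with plain XOR, which is cheap but symmetric (a key with `b_n_size` and `b_k_size` swapped hashes identically); a boost-style `hash_combine` is the usual order-sensitive alternative, sketched here for comparison (not part of the commit):

```cpp
#include <cstddef>
#include <functional>

// Order-sensitive hash combining: mixing the running seed into each step
// makes {n=1, k=2} and {n=2, k=1} hash differently, unlike plain XOR.
template <typename T>
void hash_combine(std::size_t& seed, const T& value) {
  seed ^= std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// Example for a two-field key:
//   std::size_t seed = 0;
//   hash_combine(seed, key.b_n_size);
//   hash_combine(seed, key.b_k_size);
```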
```diff
@@ -239,6 +284,11 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
   }
 
   dnnl::matmul matmul = get_matmul_cache(args);
+
+  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
+  scratchpad_storage->set_data_handle(
+      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
+
   matmul.execute(default_stream(), memory_cache_);
   default_stream().wait();
 }
@@ -257,6 +307,8 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
 
   return m_size_cache_->get_or_create(key, [&]() {
     dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+    manager->realloc(desc.scratchpad_desc().get_size());
    return dnnl::matmul(desc);
   });
 }
```
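Primitive construction is the expensive step in oneDNN, so both handlers memoize: a class-level cache keyed by weight shape holds a per-M-size cache, and `get_or_create` builds a `dnnl::matmul` (and grows the shared scratchpad) only on a miss. A minimal sketch of that `get_or_create` pattern, assuming an unbounded `std::unordered_map` (the real `DNNLPrimitiveCache` in this file also bounds its size):

```cpp
#include <unordered_map>

// Memoizing cache: look the key up and build the value only on a miss.
template <typename K, typename V>
class GetOrCreateCache {
 public:
  template <typename F>
  V& get_or_create(const K& key, F&& create) {
    auto it = map_.find(key);
    if (it == map_.end()) {
      it = map_.emplace(key, create()).first;  // build once, then reuse
    }
    return it->second;
  }

 private:
  std::unordered_map<K, V> map_;  // needs std::hash<K> and operator== for K
};
```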
```diff
@@ -300,6 +352,11 @@ void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());
+
+  memory_cache_[DNNL_ARG_SCRATCHPAD] =
+      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
 
 dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
@@ -319,6 +376,9 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
                           dnnl::memory::format_tag::ab);
 
   dnnl::primitive_attr attr;
+
+  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
+
   // For PER_TOKEN, scales will be applied in outside epilogue
   if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
     attr.set_scales_mask(DNNL_ARG_SRC, 0);
```
344404
attr);
345405
}
346406
}
407+
408+
MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
409+
: DNNLMatMulPrimitiveHandler(
410+
static_cast<DNNLMatMulPrimitiveHandler::Args>(args), args.ab_type),
411+
m_size_cache_(nullptr) {
412+
assert(ab_type_ == dnnl::memory::data_type::f32 ||
413+
ab_type_ == dnnl::memory::data_type::bf16 ||
414+
ab_type_ == dnnl::memory::data_type::f16);
415+
prepack_weight(args.b_ptr,
416+
create_primitive_desc(
417+
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
418+
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
419+
.use_bias = false,
420+
.bias_type = dnnl::memory::data_type::undef},
421+
true)
422+
.weights_desc());
423+
init_runtime_memory_cache(args);
424+
}
425+
426+
static std::shared_ptr<MatMulPrimitiveHandler::MSizeCache>
427+
get_matul_class_primitive_cache(
428+
const MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
429+
int64_t cache_size) {
430+
static MatMulPrimitiveHandler::ClassMatmulCache cache(128);
431+
assert(cache_size > 0);
432+
return cache.get_or_create(key, [&]() {
433+
return std::make_shared<MatMulPrimitiveHandler::MSizeCache>(cache_size);
434+
});
435+
}
436+
437+
void MatMulPrimitiveHandler::execute(ExecArgs& args) {
438+
auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
439+
auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
440+
a_storage->set_data_handle((void*)args.a_ptr);
441+
a_mem_desc->dims[0] = args.a_m_size;
442+
a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride;
443+
c_storage->set_data_handle((void*)args.c_ptr);
444+
c_mem_desc->dims[0] = args.a_m_size;
445+
446+
if (args.use_bias) {
447+
auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
448+
bias_storage->set_data_handle((void*)args.bias_ptr);
449+
}
450+
451+
dnnl::matmul matmul = get_matmul_cache(args);
452+
453+
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
454+
scratchpad_storage->set_data_handle(
455+
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
456+
457+
matmul.execute(default_stream(), memory_cache_);
458+
default_stream().wait();
459+
}
460+
461+
dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
462+
const MSizeCacheKey& key) {
463+
if (m_size_cache_.get() == nullptr) {
464+
ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
465+
m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
466+
}
467+
return m_size_cache_->get_or_create(key, [&]() {
468+
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
469+
auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
470+
manager->realloc(desc.scratchpad_desc().get_size());
471+
return dnnl::matmul(desc);
472+
});
473+
}
474+
475+
dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
476+
const MSizeCacheKey& key, bool first_time) {
477+
dnnl::memory::desc a_md;
478+
dnnl::memory::desc b_md;
479+
if (first_time) {
480+
a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
481+
dnnl::memory::format_tag::ab);
482+
b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
483+
dnnl::memory::format_tag::any);
484+
} else {
485+
a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
486+
{key.a_m_stride, 1});
487+
b_md = b_target_mem_desc_;
488+
}
489+
dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
490+
dnnl::memory::format_tag::ab);
491+
492+
dnnl::primitive_attr attr;
493+
attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
494+
495+
if (key.use_bias) {
496+
dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
497+
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
498+
c_md, attr);
499+
} else {
500+
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
501+
attr);
502+
}
503+
}
504+
505+
void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
506+
memory_cache_[DNNL_ARG_SRC] = dnnl::memory(
507+
{{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr);
508+
set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
509+
memory_cache_[DNNL_ARG_DST] =
510+
dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
511+
default_engine(), nullptr);
512+
set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
513+
514+
memory_cache_[DNNL_ARG_BIAS] =
515+
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
516+
default_engine(), nullptr);
517+
set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
518+
519+
memory_cache_[DNNL_ARG_SCRATCHPAD] =
520+
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
521+
default_engine(), nullptr);
522+
set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
523+
}
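The constructor above prepacks the weights once: it describes them with `format_tag::any`, lets the matmul primitive descriptor pick its preferred blocked layout (`weights_desc()`), and reorders the user's row-major matrix into that layout before any `execute()` call. A standalone sketch of the same oneDNN reorder step (illustrative; `prepack_weights` is not the commit's helper):

```cpp
#include "oneapi/dnnl/dnnl.hpp"

// Reorder row-major (k x n) weights into the layout a matmul primitive
// descriptor chose for a `format_tag::any` weights desc.
dnnl::memory prepack_weights(dnnl::engine& eng, dnnl::stream& strm,
                             const dnnl::memory::desc& preferred_desc,
                             void* user_weights, dnnl::memory::dim k,
                             dnnl::memory::dim n,
                             dnnl::memory::data_type dt) {
  dnnl::memory::desc user_md({k, n}, dt, dnnl::memory::format_tag::ab);
  dnnl::memory user_mem(user_md, eng, user_weights);

  dnnl::memory packed(preferred_desc, eng);  // library-allocated, blocked layout
  dnnl::reorder(user_mem, packed).execute(strm, user_mem, packed);
  strm.wait();
  return packed;  // reused by every subsequent matmul execution
}
```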
