Skip to content

Commit 59e4598

Browse files
authored
Merge branch 'main' into fix-fla-crash-on-plugin
2 parents b483cc9 + a4a4f0f commit 59e4598

File tree

346 files changed

+10779
-3441
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

346 files changed

+10779
-3441
lines changed

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ steps:
2222
agents:
2323
queue: arm64_cpu_queue_postmerge
2424
commands:
25-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
25+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
2626
- "mkdir artifacts"
2727
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
2828
- "bash .buildkite/scripts/upload-wheels.sh"

.buildkite/test-amd.yaml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ steps:
5050

5151
- label: Async Engine, Inputs, Utils, Worker Test # 36min
5252
timeout_in_minutes: 50
53-
mirror_hardwares: [amdexperimental]
53+
mirror_hardwares: [amdexperimental, amdproduction]
5454
agent_pool: mi325_1
5555
# grade: Blocking
5656
source_file_dependencies:
@@ -561,7 +561,7 @@ steps:
561561

562562
- label: Model Executor Test # 23min
563563
timeout_in_minutes: 35
564-
mirror_hardwares: [amdexperimental]
564+
mirror_hardwares: [amdexperimental, amdproduction]
565565
agent_pool: mi325_1
566566
# grade: Blocking
567567
source_file_dependencies:
@@ -789,8 +789,10 @@ steps:
789789
- vllm/
790790
- tests/models/language/generation
791791
commands:
792-
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
793-
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
792+
# Install fast path packages for testing against transformers
793+
# Note: also needed to run plamo2 model in vLLM
794+
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
795+
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
794796
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
795797

796798
- label: Language Models Test (PPL)

.buildkite/test-pipeline.yaml

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,15 @@ steps:
313313
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
314314
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
315315

316+
- label: V1 Test attention (H100) # 10min
317+
timeout_in_minutes: 30
318+
gpu: h100
319+
source_file_dependencies:
320+
- vllm/v1/attention
321+
- tests/v1/attention
322+
commands:
323+
- pytest -v -s v1/attention
324+
316325
- label: V1 Test others (CPU) # 5 mins
317326
source_file_dependencies:
318327
- vllm/
@@ -435,6 +444,18 @@ steps:
435444
- pytest -v -s compile/test_full_graph.py
436445
- pytest -v -s compile/test_fusions_e2e.py
437446

447+
- label: Cudagraph test
448+
timeout_in_minutes: 20
449+
mirror_hardwares: [amdexperimental]
450+
source_file_dependencies:
451+
- tests/v1/cudagraph
452+
- vllm/v1/cudagraph_dispatcher.py
453+
- vllm/config/compilation.py
454+
- vllm/compilation
455+
commands:
456+
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
457+
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
458+
438459
- label: Kernels Core Operation Test # 48min
439460
timeout_in_minutes: 75
440461
mirror_hardwares: [amdexperimental]
@@ -687,8 +708,10 @@ steps:
687708
- vllm/
688709
- tests/models/language/generation
689710
commands:
690-
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
691-
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
711+
# Install fast path packages for testing against transformers
712+
# Note: also needed to run plamo2 model in vLLM
713+
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
714+
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
692715
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
693716

694717
- label: Language Models Test (PPL)

benchmarks/benchmark_block_pool.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from benchmark_utils import TimeCollector
66
from tabulate import tabulate
77

8-
from vllm.utils import FlexibleArgumentParser
8+
from vllm.utils.argparse_utils import FlexibleArgumentParser
99
from vllm.v1.core.block_pool import BlockPool
1010

1111

benchmarks/benchmark_long_document_qa_throughput.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646

4747
from vllm import LLM, SamplingParams
4848
from vllm.engine.arg_utils import EngineArgs
49-
from vllm.utils import FlexibleArgumentParser
49+
from vllm.utils.argparse_utils import FlexibleArgumentParser
5050

5151

5252
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):

benchmarks/benchmark_ngram_proposer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
VllmConfig,
2020
)
2121
from vllm.platforms import current_platform
22-
from vllm.utils import FlexibleArgumentParser
22+
from vllm.utils.argparse_utils import FlexibleArgumentParser
2323
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
2424
from vllm.v1.worker.gpu_input_batch import InputBatch
2525
from vllm.v1.worker.gpu_model_runner import GPUModelRunner

benchmarks/benchmark_prefix_caching.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737

3838
from vllm import LLM, SamplingParams
3939
from vllm.engine.arg_utils import EngineArgs
40-
from vllm.utils import FlexibleArgumentParser
40+
from vllm.utils.argparse_utils import FlexibleArgumentParser
4141

4242
try:
4343
from vllm.transformers_utils.tokenizer import get_tokenizer

benchmarks/benchmark_prioritization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from transformers import AutoTokenizer, PreTrainedTokenizerBase
1212

1313
from vllm.engine.arg_utils import EngineArgs
14-
from vllm.utils import FlexibleArgumentParser
14+
from vllm.utils.argparse_utils import FlexibleArgumentParser
1515

1616

1717
# Select a equi-probable random priority

benchmarks/benchmark_serving_structured_output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
from backend_request_func import get_tokenizer
5252

5353
try:
54-
from vllm.utils import FlexibleArgumentParser
54+
from vllm.utils.argparse_utils import FlexibleArgumentParser
5555
except ImportError:
5656
from argparse import ArgumentParser as FlexibleArgumentParser
5757

benchmarks/cutlass_benchmarks/sparse_benchmarks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from weight_shapes import WEIGHT_SHAPES
1616

1717
from vllm import _custom_ops as ops
18-
from vllm.utils import FlexibleArgumentParser
18+
from vllm.utils.argparse_utils import FlexibleArgumentParser
1919

2020
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
2121
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]

0 commit comments

Comments
 (0)