vllm-project
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
Lines changed: 6 additions & 4 deletions
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
Lines changed: 25 additions & 2 deletions
diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ steps:
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
 
@@ -50,7 +50,7 @@ steps:
 
 - label: Async Engine, Inputs, Utils, Worker Test # 36min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -561,7 +561,7 @@ steps:
 
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -789,8 +789,10 @@ steps:
   - vllm/
   - tests/models/language/generation
   commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
     - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
 - label: Language Models Test (PPL)
 
@@ -313,6 +313,15 @@ steps:
     - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
     - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
+- label: V1 Test attention (H100) # 10min
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
 - label: V1 Test others (CPU) # 5 mins
   source_file_dependencies:
     - vllm/
@@ -435,6 +444,18 @@ steps:
   - pytest -v -s compile/test_full_graph.py
   - pytest -v -s compile/test_fusions_e2e.py
 
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
+
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
   mirror_hardwares: [amdexperimental]
@@ -687,8 +708,10 @@ steps:
   - vllm/
   - tests/models/language/generation
   commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
     - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
 - label: Language Models Test (PPL)
 
@@ -5,7 +5,7 @@
 from benchmark_utils import TimeCollector
 from tabulate import tabulate
 
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool
 
 
 
@@ -46,7 +46,7 @@
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
 
@@ -19,7 +19,7 @@
     VllmConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
@@ -37,7 +37,7 @@
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
 
@@ -11,7 +11,7 @@
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 # Select a equi-probable random priority
 
@@ -51,7 +51,7 @@
     from backend_request_func import get_tokenizer
 
 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
 
@@ -15,7 +15,7 @@
 from weight_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@`
`19`	`19`	`VllmConfig,`
`20`	`20`	`)`
`21`	`21`	`from vllm.platforms import current_platform`
`22`		`-from vllm.utils import FlexibleArgumentParser`
	`22`	`+from vllm.utils.argparse_utils import FlexibleArgumentParser`
`23`	`23`	`from vllm.v1.spec_decode.ngram_proposer import NgramProposer`
`24`	`24`	`from vllm.v1.worker.gpu_input_batch import InputBatch`
`25`	`25`	`from vllm.v1.worker.gpu_model_runner import GPUModelRunner`