Commit a889309

Merge branch 'main' into lint
2 parents: e5526be + 38ba061


41 files changed: +656 / -296 lines

csrc/mamba/mamba_ssm/selective_scan_fwd.cu

Lines changed: 47 additions & 25 deletions
```diff
@@ -27,11 +27,12 @@

 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
          bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_>
 struct Selective_Scan_fwd_kernel_traits {
     static_assert(kNItems_ % 4 == 0);
     using input_t = input_t_;
     using weight_t = weight_t_;
+    using state_t = state_t_;
     static constexpr int kNThreads = kNThreads_;
     // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy.
     static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3;
@@ -132,7 +133,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
     input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) +
+    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
         cache_index * params.ssm_states_batch_stride +
         dim_id * kNRows * params.ssm_states_dim_stride;

@@ -261,7 +262,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     if (threadIdx.x == 0) {
         smem_running_prefix[state_idx] = prefix_op.running_prefix;
         if (chunk == n_chunks - 1) {
-            ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
+            ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
         }
     }
     #pragma unroll
@@ -310,7 +311,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     }
 }

-template<int kNThreads, int kNItems, typename input_t, typename weight_t>
+template<int kNThreads, int kNItems, typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block
     // processing 1 row.
@@ -321,7 +322,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
         BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
             BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kVarlen, input_t, weight_t>;
+                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kVarlen, input_t, weight_t, state_t>;
                 constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
                 dim3 grid(params.batch, params.dim / kNRows);
                 auto kernel = &selective_scan_fwd_kernel<Ktraits>;
@@ -341,59 +342,78 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     });
 }

-template<typename input_t, typename weight_t>
+template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {

 #ifndef USE_ROCM
     if (params.seqlen <= 128) {
-        selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
     } else if (params.seqlen <= 256) {
-        selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
     } else if (params.seqlen <= 512) {
-        selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream);
     } else if (params.seqlen <= 1024) {
-        selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
     } else {
-        selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
     }
 #else
     if (params.seqlen <= 256) {
-        selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
     } else if (params.seqlen <= 512) {
-        selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
     } else if (params.seqlen <= 1024) {
-        selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
     } else {
-        selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
+        selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
    }
 #endif
 }

-template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float, at::BFloat16>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float, at::Half>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<float, float, float>(SSMParamsBase &params, cudaStream_t stream);

 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

-#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \
+#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...) \
     if (ITYPE == at::ScalarType::Half) { \
         using input_t = at::Half; \
         using weight_t = float; \
-        __VA_ARGS__(); \
+        if (STYPE == at::ScalarType::Half) { \
+            using state_t = at::Half; \
+            __VA_ARGS__(); \
+        } else if (STYPE == at::ScalarType::Float) { \
+            using state_t = float; \
+            __VA_ARGS__(); \
+        } else { \
+            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
+        } \
     } else if (ITYPE == at::ScalarType::BFloat16) { \
         using input_t = at::BFloat16; \
         using weight_t = float; \
-        __VA_ARGS__(); \
+        if (STYPE == at::ScalarType::BFloat16) { \
+            using state_t = at::BFloat16; \
+            __VA_ARGS__(); \
+        } else if (STYPE == at::ScalarType::Float) { \
+            using state_t = float; \
+            __VA_ARGS__(); \
+        } else { \
+            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
+        } \
     } else if (ITYPE == at::ScalarType::Float) { \
         using input_t = float; \
         using weight_t = float; \
+        using state_t = float; \
         __VA_ARGS__(); \
     } else { \
         AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
     }


-template<typename input_t, typename weight_t>
+template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream);

 void set_ssm_params_fwd(SSMParamsBase &params,
@@ -648,7 +668,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,

     // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
     at::Tensor out = delta;
-    TORCH_CHECK(ssm_states.scalar_type() == input_type);
+    // ssm_states can now be either the same as input_type or float32
+    auto state_type = ssm_states.scalar_type();
+    TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float);
     TORCH_CHECK(ssm_states.is_cuda());
     TORCH_CHECK(ssm_states.stride(-1) == 1);

@@ -670,7 +692,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,

     const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
     auto stream = at::cuda::getCurrentCUDAStream().stream();
-    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
-        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] {
+        selective_scan_fwd_cuda<input_t, weight_t, state_t>(params, stream);
     });
 }
```
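The net effect of this change is that the SSM state cache may be kept in float32 even when activations are half precision. A minimal, illustrative Python sketch (not part of the commit; the helper name is hypothetical) of the input/state dtype pairs the widened dispatch now accepts:

```python
# Illustrative only: mirrors the (input dtype, state dtype) combinations accepted
# by the updated DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16 macro above.
import torch

_SUPPORTED_STATE_DTYPES = {
    torch.float16: {torch.float16, torch.float32},
    torch.bfloat16: {torch.bfloat16, torch.float32},
    torch.float32: {torch.float32},
}


def check_selective_scan_dtypes(u: torch.Tensor, ssm_states: torch.Tensor) -> None:
    """Hypothetical helper: the state dtype must match the input dtype or be fp32."""
    allowed = _SUPPORTED_STATE_DTYPES.get(u.dtype)
    if allowed is None or ssm_states.dtype not in allowed:
        raise TypeError(
            f"selective_scan_fwd not implemented for input {u.dtype} "
            f"with state {ssm_states.dtype}")
```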

docs/configuration/optimization.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -174,6 +174,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u

 Known supported models:

+- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
 - Kimi-VL (<gh-pr:23817>)
 - Llama4 (<gh-pr:18368>)
 - MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
```

docs/deployment/frameworks/lws.md

Lines changed: 2 additions & 4 deletions
````diff
@@ -22,7 +22,7 @@ Deploy the following yaml file `lws.yaml`
 metadata:
   name: vllm
 spec:
-  replicas: 2
+  replicas: 1
   leaderWorkerTemplate:
     size: 2
     restartPolicy: RecreateGroupOnPodRestart
@@ -41,7 +41,7 @@ Deploy the following yaml file `lws.yaml`
           - sh
           - -c
           - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
-            python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
+            vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2"
         resources:
           limits:
             nvidia.com/gpu: "8"
@@ -126,8 +126,6 @@ Should get an output similar to this:
 NAME     READY   STATUS    RESTARTS   AGE
 vllm-0   1/1     Running   0          2s
 vllm-0-1 1/1     Running   0          2s
-vllm-1   1/1     Running   0          2s
-vllm-1-1 1/1     Running   0          2s
 ```

 Verify that the distributed tensor-parallel inference works:
````
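The updated manifest launches the OpenAI-compatible server with `vllm serve` on port 8080. As a quick sanity check (not part of the doc's diff; assumes the leader pod's port 8080 has been forwarded to localhost, e.g. with `kubectl port-forward`), a minimal client sketch:

```python
# Assumes: kubectl port-forward <leader-pod> 8080:8080 is running locally.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="EMPTY")
completion = client.completions.create(
    model="meta-llama/Meta-Llama-3.1-405B-Instruct",
    prompt="San Francisco is a",
    max_tokens=16,
)
print(completion.choices[0].text)
```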

examples/online_serving/multi-node-serving.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -11,7 +11,7 @@
 # Example usage:
 # On the head node machine, start the Ray head node process and run a vLLM server.
 # ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
-#   python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
+#   vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
 #
 # On each worker node, start the Ray worker node process.
 # ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
```

examples/online_serving/openai_chat_completion_client_for_multimodal.py

Lines changed: 42 additions & 0 deletions
```diff
@@ -266,10 +266,52 @@ def run_audio(model: str) -> None:
     print("Chat completion output from base64 encoded audio:", result)


+def run_multi_audio(model: str) -> None:
+    from vllm.assets.audio import AudioAsset
+
+    # Two different audios to showcase batched inference.
+    audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
+    audio_url2 = AudioAsset("azacinto_foscolo").url
+    audio_base64_2 = encode_base64_content_from_url(audio_url2)
+
+    # OpenAI-compatible schema (`input_audio`)
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Are these two audios the same?"},
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64,
+                            "format": "wav",
+                        },
+                    },
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64_2,
+                            "format": "wav",
+                        },
+                    },
+                ],
+            }
+        ],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+
+
 example_function_map = {
     "text-only": run_text_only,
     "single-image": run_single_image,
     "multi-image": run_multi_image,
+    "multi-audio": run_multi_audio,
     "video": run_video,
     "audio": run_audio,
 }
```
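The new `run_multi_audio` example reuses `encode_base64_content_from_url`, which is defined earlier in this script. For context, a minimal sketch of what such a helper looks like (an assumption, not the file's exact code):

```python
import base64

import requests


def encode_base64_content_from_url(content_url: str) -> str:
    """Fetch remote content and return it base64-encoded (sketch, not the original)."""
    response = requests.get(content_url)
    response.raise_for_status()
    return base64.b64encode(response.content).decode("utf-8")
```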

tests/async_engine/test_api_server.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -98,7 +98,7 @@ def test_api_server(api_server, distributed_executor_backend: str):
     pool.join()

     # check cancellation stats
-    # give it some times to update the stats
+    # give it some time to update the stats
     time.sleep(1)

     num_aborted_requests = requests.get(
```

tests/core/block/e2e/test_correctness.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
 @pytest.mark.parametrize("seed", [1])
 def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
                                                   test_llm_generator):
-    """Verify block manager v2 with auto prefix caching could works normal
+    """Verify block manager v2 with auto prefix caching could work normally
     even when eviction started.
     With APC enabled, all blocks are held by native block at the beginning.
-    Then blocks are managed by evictor instead. If cache hit at the evitor's
+    Then blocks are managed by evictor instead. If cache hit at the evictor's
     block, then it could be reused, or we need to recompute its kv cache.
     """
     output_len = 10
```

tests/engine/test_arg_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -167,7 +167,7 @@ def test_get_kwargs():
     # dict should have json tip in help
     json_tip = "Should either be a valid JSON string or JSON keys"
     assert json_tip in kwargs["json_tip"]["help"]
-    # nested config should should construct the nested config
+    # nested config should construct the nested config
     assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)

```
tests/entrypoints/conftest.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -201,3 +201,32 @@ def sample_sql_statements():
     condition: column "=" number
     number: "1" | "2"
 """)
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_files():
+    """Download zephyr LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_added_tokens_files(zephyr_lora_files):
+    """Create zephyr LoRA files with added tokens once per test session."""
+    import shutil
+    from tempfile import TemporaryDirectory
+
+    from transformers import AutoTokenizer
+
+    tmp_dir = TemporaryDirectory()
+    tmp_model_dir = f"{tmp_dir.name}/zephyr"
+    shutil.copytree(zephyr_lora_files, tmp_model_dir)
+    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+    # Copy tokenizer to adapter and add some unique tokens
+    # 32000, 32001, 32002
+    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
+                                 special_tokens=True)
+    assert added == 3
+    tokenizer.save_pretrained(tmp_model_dir)
+    yield tmp_model_dir
+    tmp_dir.cleanup()
```
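With these session-scoped fixtures in `conftest.py`, any test module under `tests/entrypoints/` can request them by name and pytest wires them up without cross-module imports, which is why the explicit imports are dropped in `tests/entrypoints/openai/test_chat.py` below. A hypothetical usage sketch (not part of the commit):

```python
from transformers import AutoTokenizer


def test_lora_adapter_exposes_added_tokens(zephyr_lora_added_tokens_files):
    # The fixture adds vllm1/vllm2/vllm3 as special tokens (ids 32000-32002).
    tokenizer = AutoTokenizer.from_pretrained(zephyr_lora_added_tokens_files)
    assert tokenizer.convert_tokens_to_ids("vllm1") == 32000
```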

tests/entrypoints/openai/test_chat.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -15,8 +15,6 @@
 from openai import BadRequestError, OpenAI

 from ...utils import RemoteOpenAIServer
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401

 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
```
