
Commit c680b7b

Merge branch 'main' into fix/validate-tool-requests-29432
2 parents 7909101 + 29f7d97 commit c680b7b

File tree: 26 files changed, +335 -213 lines changed


docs/deployment/docker.md

Lines changed: 20 additions & 1 deletion
@@ -82,7 +82,7 @@ DOCKER_BUILDKIT=1 docker build . \
 
 ## Building for Arm64/aarch64
 
-A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
+A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.
 
 !!! note
     Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
@@ -104,6 +104,25 @@ A docker container can be built for aarch64 systems such as the Nvidia Grace-Hop
         --build-arg RUN_WHEEL_CHECK=false
     ```
 
+For (G)B300, we recommend using CUDA 13, as shown in the following command.
+
+??? console "Command"
+
+    ```bash
+    DOCKER_BUILDKIT=1 docker build \
+        --build-arg CUDA_VERSION=13.0.1 \
+        --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 \
+        --build-arg max_jobs=256 \
+        --build-arg nvcc_threads=2 \
+        --build-arg RUN_WHEEL_CHECK=false \
+        --build-arg torch_cuda_arch_list='9.0 10.0+PTX' \
+        --platform "linux/arm64" \
+        --tag vllm/vllm-gb300-openai:latest \
+        --target vllm-openai \
+        -f docker/Dockerfile \
+        .
+    ```
+
 !!! note
     If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.

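To illustrate the QEMU requirement in the note above, here is a minimal host-side sketch. It assumes the common `qemu-aarch64` binfmt_misc entry name, which depends on how QEMU emulation was installed on your machine, so treat the path as an assumption rather than a guarantee.

```python
from pathlib import Path

# Hypothetical pre-flight check before cross-building linux/arm64 on x86_64:
# binfmt_misc should expose an enabled QEMU aarch64 handler. The entry name
# "qemu-aarch64" is the usual one but can differ per setup.
entry = Path("/proc/sys/fs/binfmt_misc/qemu-aarch64")

if entry.exists() and "enabled" in entry.read_text():
    print("QEMU aarch64 emulation is registered; cross-building should work.")
else:
    print("QEMU aarch64 emulation not found; set up binfmt/QEMU before building.")
```
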
docs/serving/data_parallel_deployment.md

Lines changed: 2 additions & 2 deletions
@@ -8,11 +8,11 @@ For MoE models, particularly those like DeepSeek that employ MLA (Multi-head Lat
 
 In these cases, the data parallel ranks are not completely independent. Forward passes must be aligned, and expert layers across all ranks are required to synchronize during every forward pass, even when there are fewer requests to be processed than DP ranks.
 
-The expert layers will by default form a (DP x TP) sized tensor parallel group. To enable expert parallelism, include the `--enable-expert-parallel` CLI arg (on all nodes in the multi-node case).
+By default, expert layers form a tensor parallel group of size `DP × TP`. To use expert parallelism instead, include the `--enable-expert-parallel` CLI arg (on all nodes in the multi-node case). See [Expert Parallel Deployment](expert_parallel_deployment.md) for details on how attention and expert layers behave differently with EP enabled.
 
 In vLLM, each DP rank is deployed as a separate "core engine" process that communicates with front-end process(es) via ZMQ sockets. Data Parallel attention can be combined with Tensor Parallel attention, in which case each DP engine owns a number of per-GPU worker processes equal to the configured TP size.
 
-For MoE models, when any requests are in progress in any rank, we must ensure that empty "dummy" forward passes are performed in all ranks that don't currently have any requests scheduled. This is handled via a separate DP Coordinator process that communicates with all ranks, and a collective operation performed every N steps to determine when all ranks become idle and can be paused. When TP is used in conjunction with DP, expert layers form an EP or TP group of size (DP x TP).
+For MoE models, when any requests are in progress in any rank, we must ensure that empty "dummy" forward passes are performed in all ranks that don't currently have any requests scheduled. This is handled via a separate DP Coordinator process that communicates with all ranks, and a collective operation performed every N steps to determine when all ranks become idle and can be paused. When TP is used in conjunction with DP, expert layers form a group of size `DP × TP` (using either tensor parallelism by default, or expert parallelism if `--enable-expert-parallel` is set).
 
 In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently.

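A conceptual sketch of the "dummy" forward-pass idea from the paragraph above (this is not vLLM's actual DP Coordinator; it only illustrates why idle ranks still step so that the synchronized expert layers see the same number of collective calls on every rank):

```python
# Conceptual sketch only: when any DP rank has work, every rank must run a
# forward pass; ranks with no scheduled requests run a "dummy" batch instead.

def dp_step(per_rank_batches: list[list[str]]) -> list[str]:
    any_work = any(per_rank_batches)  # in practice decided via a collective
    actions = []
    for rank, batch in enumerate(per_rank_batches):
        if not any_work:
            actions.append(f"rank {rank}: idle, no forward pass (all ranks paused)")
        elif batch:
            actions.append(f"rank {rank}: forward pass on {len(batch)} request(s)")
        else:
            actions.append(f"rank {rank}: dummy forward pass to keep collectives aligned")
    return actions


for line in dp_step([["req-1", "req-2"], [], ["req-3"], []]):
    print(line)
```
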
docs/serving/expert_parallel_deployment.md

Lines changed: 21 additions & 1 deletion
@@ -44,7 +44,27 @@ Where:
 - `DP_SIZE`: Data parallel size
 - `EP_SIZE`: Expert parallel size (computed automatically)
 
-When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`.
+### Layer Behavior with EP Enabled
+
+When EP is enabled, different layers in MoE models behave differently:
+
+| Layer Type | Behavior | Parallelism Used |
+|------------|----------|------------------|
+| **Expert (MoE) Layers** | Sharded across all EP ranks | Expert Parallel (EP) of size `TP × DP` |
+| **Attention Layers** | Behavior depends on TP size | See below |
+
+**Attention layer parallelism:**
+
+- **When `TP = 1`**: Attention weights are **replicated** across all DP ranks (data parallelism)
+- **When `TP > 1`**: Attention weights are **sharded** using tensor parallelism across TP ranks within each DP group
+
+For example, with `TP=2, DP=4` (8 GPUs total):
+
+- Expert layers form an EP group of size 8, with experts distributed across all GPUs
+- Attention layers use TP=2 within each of the 4 DP groups
+
+!!! note "Key Difference from Data Parallel Deployment"
+    Without `--enable-expert-parallel`, MoE layers would use tensor parallelism (forming a TP group of size `TP × DP`), similar to dense models. With EP enabled, expert layers switch to expert parallelism, which can provide better efficiency and locality for MoE models.
 
 ### Example Command

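To make the `TP=2, DP=4` illustration above concrete, here is a toy Python sketch of splitting experts across an EP group of size `TP × DP`. The contiguous split and the expert count are assumptions for illustration; vLLM's actual expert placement may differ.

```python
# Toy illustration: the EP group spans all TP * DP = 8 GPUs, and experts are
# split evenly across it. The contiguous split below is illustrative only.

TP, DP, NUM_EXPERTS = 2, 4, 64
ep_size = TP * DP

experts_per_rank = NUM_EXPERTS // ep_size
for ep_rank in range(ep_size):
    start = ep_rank * experts_per_rank
    end = start + experts_per_rank
    dp_group, tp_rank = divmod(ep_rank, TP)
    print(
        f"GPU {ep_rank} (DP group {dp_group}, TP rank {tp_rank}): "
        f"experts {start}..{end - 1}"
    )
```
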
docs/serving/parallelism_scaling.md

Lines changed: 23 additions & 1 deletion
@@ -62,7 +62,7 @@ If a single node lacks sufficient GPUs to hold the model, deploy vLLM across mul
 
 ### What is Ray?
 
-Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments require Ray as the runtime engine.
+Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments can use Ray as the runtime engine.
 
 vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens.
 
@@ -130,6 +130,28 @@ vllm serve /path/to/the/model/in/the/container \
     --distributed-executor-backend ray
 ```
 
+### Running vLLM with Multiprocessing
+
+Besides Ray, multi-node vLLM deployments can also use `multiprocessing` as the runtime engine. Here's an example that deploys a model across 2 nodes (8 GPUs per node) with `tp_size=8` and `pp_size=2`.
+
+Choose one node as the head node and run:
+
+```bash
+vllm serve /path/to/the/model/in/the/container \
+    --tensor-parallel-size 8 --pipeline-parallel-size 2 \
+    --nnodes 2 --node-rank 0 \
+    --master-addr <HEAD_NODE_IP>
+```
+
+On the other worker node, run:
+
+```bash
+vllm serve /path/to/the/model/in/the/container \
+    --tensor-parallel-size 8 --pipeline-parallel-size 2 \
+    --nnodes 2 --node-rank 1 \
+    --master-addr <HEAD_NODE_IP> --headless
+```
+
 ## Optimizing network communication for tensor parallelism
 
 Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.

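A small sanity-check sketch for the 2-node example in the added docs above: the world size must equal `TP × PP`, and with 8 GPUs per node each node naturally hosts one pipeline stage. The rank-to-node mapping shown here is illustrative only; the actual placement is decided by vLLM.

```python
# Illustrative check for the tp_size=8, pp_size=2, 2-node example.
TP, PP = 8, 2
NNODES, GPUS_PER_NODE = 2, 8
assert TP * PP == NNODES * GPUS_PER_NODE

for global_rank in range(TP * PP):
    pp_stage, tp_rank = divmod(global_rank, TP)
    node = global_rank // GPUS_PER_NODE
    print(f"rank {global_rank}: node {node}, pipeline stage {pp_stage}, TP rank {tp_rank}")
```
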
tests/test_inputs.py

Lines changed: 9 additions & 2 deletions
@@ -7,7 +7,7 @@
 from vllm.inputs import zip_enc_dec_prompts
 from vllm.inputs.parse import parse_raw_prompts
 from vllm.inputs.preprocess import InputPreprocessor
-from vllm.tokenizers import init_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
 
 pytestmark = pytest.mark.cpu_test
 
@@ -34,6 +34,13 @@
 ]
 
 
+# Test that a nested mixed-type list of lists raises a TypeError.
+@pytest.mark.parametrize("invalid_input", [[[1, 2], ["foo", "bar"]]])
+def test_invalid_input_raise_type_error(invalid_input):
+    with pytest.raises(TypeError):
+        parse_raw_prompts(invalid_input)
+
+
 def test_parse_raw_single_batch_empty():
     with pytest.raises(ValueError, match="at least one prompt"):
         parse_raw_prompts([])
@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
 )
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
-    tokenizer = init_tokenizer_from_config(model_config)
+    tokenizer = cached_tokenizer_from_config(model_config)
     input_preprocessor = InputPreprocessor(model_config, tokenizer)
 
     # HF processor adds sep token

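The new test above exercises the invariant that a batch of token-ID prompts must contain ints only. The following toy validator (a hypothetical helper, not vLLM's `parse_raw_prompts`) shows why the nested mixed-type input is rejected:

```python
# Toy validator illustrating the invariant behind the new test: each token-ID
# prompt in a batch must be a list of ints, never a mix with strings.

def validate_token_batches(batches: list[list[object]]) -> None:
    for i, batch in enumerate(batches):
        if not all(isinstance(tok, int) for tok in batch):
            raise TypeError(f"prompt {i} mixes token IDs with non-int values: {batch!r}")


validate_token_batches([[1, 2], [3, 4]])  # fine
try:
    validate_token_batches([[1, 2], ["foo", "bar"]])  # the case from the test
except TypeError as exc:
    print(exc)
```
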
tests/tokenizers_/test_basic.py

Lines changed: 24 additions & 23 deletions
@@ -3,38 +3,39 @@
 from typing import _get_protocol_attrs  # type: ignore
 
 import pytest
-from transformers import PreTrainedTokenizerBase
+from transformers import (
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    PreTrainedTokenizerFast,
+)
 
 from vllm.tokenizers import TokenizerLike, get_tokenizer
+from vllm.tokenizers.mistral import MistralTokenizer
 
 
 def _get_missing_attrs(obj: object, target: type):
     return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]
 
 
+def _assert_tokenizer_like(tokenizer: object):
+    missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike)
+    assert not missing_attrs, f"Missing attrs: {missing_attrs}"
+
+
 def test_tokenizer_like_protocol():
-    assert not (
-        missing_attrs := _get_missing_attrs(
-            get_tokenizer("gpt2", use_fast=False),
-            TokenizerLike,
-        )
-    ), f"Missing attrs: {missing_attrs}"
-
-    assert not (
-        missing_attrs := _get_missing_attrs(
-            get_tokenizer("gpt2", use_fast=True),
-            TokenizerLike,
-        )
-    ), f"Missing attrs: {missing_attrs}"
-
-    assert not (
-        missing_attrs := _get_missing_attrs(
-            get_tokenizer(
-                "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
-            ),
-            TokenizerLike,
-        )
-    ), f"Missing attrs: {missing_attrs}"
+    tokenizer = get_tokenizer("gpt2", use_fast=False)
+    assert isinstance(tokenizer, PreTrainedTokenizer)
+    _assert_tokenizer_like(tokenizer)
+
+    tokenizer = get_tokenizer("gpt2", use_fast=True)
+    assert isinstance(tokenizer, PreTrainedTokenizerFast)
+    _assert_tokenizer_like(tokenizer)
+
+    tokenizer = get_tokenizer(
+        "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
+    )
+    assert isinstance(tokenizer, MistralTokenizer)
+    _assert_tokenizer_like(tokenizer)
 
 
 @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])

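The helper above checks structural conformance to the `TokenizerLike` protocol by listing missing attributes. A related standard-library pattern, shown in this self-contained sketch with toy names, uses `runtime_checkable` so `isinstance()` performs the same kind of attribute-presence check:

```python
from typing import Protocol, runtime_checkable


@runtime_checkable
class SupportsEncode(Protocol):
    """Toy stand-in for a tokenizer-like protocol (hypothetical)."""

    def encode(self, text: str) -> list[int]: ...


class ToyTokenizer:
    def encode(self, text: str) -> list[int]:
        # Not a real tokenizer: map each character to its code point.
        return [ord(c) for c in text]


# isinstance() against a runtime_checkable Protocol only verifies that the
# required attributes exist, much like the _get_missing_attrs helper above.
assert isinstance(ToyTokenizer(), SupportsEncode)
```
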
tests/tokenizers_/test_registry.py

Lines changed: 21 additions & 2 deletions
@@ -2,7 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 
-from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
+import pytest
+
+from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers.registry import (
+    TokenizerRegistry,
+    get_tokenizer,
+    resolve_tokenizer_args,
+)
 
 
 class TestTokenizer(TokenizerLike):
@@ -40,10 +47,22 @@ def is_fast(self) -> bool:
         return True
 
 
+@pytest.mark.parametrize("runner_type", ["generate", "pooling"])
+def test_resolve_tokenizer_args_idempotent(runner_type):
+    tokenizer_mode, tokenizer_name, args, kwargs = resolve_tokenizer_args(
+        "facebook/opt-125m",
+        runner_type=runner_type,
+    )
+
+    assert (tokenizer_mode, tokenizer_name, args, kwargs) == resolve_tokenizer_args(
+        tokenizer_name, *args, **kwargs
+    )
+
+
 def test_customized_tokenizer():
     TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)
 
-    tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
+    tokenizer = TokenizerRegistry.load_tokenizer("test_tokenizer", "abc")
     assert isinstance(tokenizer, TestTokenizer)
     assert tokenizer.path_or_repo_id == "abc"
     assert tokenizer.bos_token_id == 0

vllm/compilation/decorators.py

Lines changed: 22 additions & 3 deletions
@@ -28,7 +28,7 @@
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
 from vllm.utils.import_utils import resolve_obj_by_qualname
-from vllm.utils.torch_utils import supports_dynamo
+from vllm.utils.torch_utils import is_torch_equal_or_newer, supports_dynamo
 
 from .monitor import start_monitoring_torch_compile
 
@@ -316,7 +316,13 @@ def __init__(
     def _mark_dynamic_inputs(mod, type, *args, **kwargs):
         def mark_dynamic(arg, dims):
             if type == DynamicShapesType.UNBACKED:
-                torch._dynamo.decorators.mark_unbacked(arg, dims)
+                if is_torch_equal_or_newer("2.10.0.dev"):
+                    for dim in dims:
+                        torch._dynamo.decorators.mark_unbacked(
+                            arg, dim, hint_override=arg.size()[dim]
+                        )
+                else:
+                    torch._dynamo.decorators.mark_unbacked(arg, dims)
             else:
                 torch._dynamo.mark_dynamic(arg, dims)
 
@@ -350,7 +356,13 @@ def mark_dynamic(arg, dims):
             if isinstance(arg, torch.Tensor):
                 # In case dims is specified with negative indexing
                 dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
-                torch._dynamo.decorators.mark_unbacked(arg, dims)
+                if is_torch_equal_or_newer("2.10.0.dev"):
+                    for dim in dims:
+                        torch._dynamo.decorators.mark_unbacked(
+                            arg, dim, hint_override=arg.size()[dim]
+                        )
+                else:
+                    torch._dynamo.decorators.mark_unbacked(arg, dims)
 
     def __call__(self, *args, **kwargs):
         # torch.compiler.is_compiling() means we are inside the compilation
@@ -488,6 +500,12 @@ def patched_inline_call(self_):
         if ds_type == DynamicShapesType.BACKED_SIZE_OBLIVIOUS:
             fx_config_patches["backed_size_oblivious"] = True
 
+        # Prepare inductor config patches
+        # assume_32bit_indexing is only available in torch 2.10.0.dev+
+        inductor_config_patches = {}
+        if is_torch_equal_or_newer("2.10.0.dev"):
+            inductor_config_patches["assume_32bit_indexing"] = True
+
         with (
             patch.object(
                 InliningInstructionTranslator, "inline_call_", patched_inline_call
@@ -496,6 +514,7 @@ def patched_inline_call(self_):
             maybe_use_cudagraph_partition_wrapper(self.vllm_config),
             torch.fx.experimental._config.patch(**fx_config_patches),
             _torch27_patch_tensor_subclasses(),
+            torch._inductor.config.patch(**inductor_config_patches),
         ):
            if envs.VLLM_USE_AOT_COMPILE:
                self.aot_compiled_fn = self.aot_compile(*args, **kwargs)

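Both hunks above gate newer torch behavior on `is_torch_equal_or_newer("2.10.0.dev")`. A minimal sketch of that version-gating idea, using `packaging` to compare PEP 440 strings (this mirrors the spirit of the helper, not necessarily its exact implementation; the keyword name in the comment is hypothetical):

```python
from packaging import version


def is_equal_or_newer(installed: str, minimum: str) -> bool:
    """Compare PEP 440 version strings, treating dev builds correctly."""
    return version.parse(installed) >= version.parse(minimum)


# A 2.10 dev build qualifies for a 2.10-only keyword argument; 2.9.x does not,
# so the caller falls back to the older call signature.
assert is_equal_or_newer("2.10.0.dev20250101", "2.10.0.dev")
assert not is_equal_or_newer("2.9.1", "2.10.0.dev")
```
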
vllm/distributed/kv_transfer/kv_connector/v1/metrics.py

Lines changed: 0 additions & 3 deletions
@@ -7,7 +7,6 @@
 
 from vllm.config import KVTransferConfig, VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
-from vllm.distributed.kv_transfer.kv_transfer_state import has_kv_transfer_group
 from vllm.logger import init_logger
 
 PromMetric: TypeAlias = Gauge | Counter | Histogram
@@ -53,8 +52,6 @@ def is_empty(self) -> bool:
 
 class KVConnectorLogging:
     def __init__(self, kv_transfer_config: KVTransferConfig | None):
-        # This should be called on frontend process.
-        assert not has_kv_transfer_group()
         # Instantiate the connector's stats class.
         if kv_transfer_config and kv_transfer_config.kv_connector:
             self.connector_cls = KVConnectorFactory.get_connector_class(

vllm/entrypoints/chat_utils.py

Lines changed: 3 additions & 2 deletions
@@ -50,7 +50,6 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
 from vllm.tokenizers import TokenizerLike
-from vllm.tokenizers.mistral import MistralTokenizer
 from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.utils import random_uuid
@@ -60,6 +59,8 @@
 
 if TYPE_CHECKING:
     import torch
+
+    from vllm.tokenizers.mistral import MistralTokenizer
 else:
     torch = LazyLoader("torch", globals(), "torch")
 
@@ -1832,7 +1833,7 @@ def apply_hf_chat_template(
 
 
 def apply_mistral_chat_template(
-    tokenizer: MistralTokenizer,
+    tokenizer: "MistralTokenizer",
     messages: list[ChatCompletionMessageParam],
     chat_template: str | None,
     tools: list[dict[str, Any]] | None,

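The chat_utils.py change applies the standard pattern of deferring an import to type-checking time and quoting the annotation. A self-contained sketch of that pattern, using `decimal.Decimal` as a stand-in for a heavyweight or cycle-prone module:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only while type checking, not at runtime, which avoids the
    # import cost (and any circular-import risk) on the hot path.
    from decimal import Decimal  # stand-in for a heavyweight module


def as_float(value: "Decimal") -> float:
    # The quoted annotation keeps this valid even though Decimal is not
    # imported at runtime.
    return float(value)
```
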