
Commit 4136ce1

[Doc]: fix typos in Python comments
Signed-off-by: Didier Durand <[email protected]>
1 parent e599e2c commit 4136ce1

File tree

12 files changed: +17 -17 lines


csrc/quantization/machete/generate.py

Lines changed: 1 addition & 1 deletion

@@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
         ))

     def prepacked_type_key(prepack_type: PrepackTypeConfig):
-        # For now we we can just use the first accumulator type seen since
+        # For now, we can just use the first accumulator type seen since
         # the tensor core shapes/layouts don't vary based on accumulator
         # type so we can generate less code this way
         return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)
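
To illustrate why keying on just these three fields shrinks the generated code, here is a small, self-contained sketch (not the generator's actual logic); the PrepackConfig dataclass and its sample values are hypothetical stand-ins for PrepackTypeConfig:

```python
# Hypothetical sketch of deduplicating prepack instantiations by a coarse key,
# mirroring the idea in prepacked_type_key: the accumulator type is ignored
# because tensor core shapes/layouts don't depend on it.
from dataclasses import dataclass


@dataclass(frozen=True)
class PrepackConfig:  # stand-in for PrepackTypeConfig; fields are assumed
    a: str            # activation type
    b_num_bits: int   # weight bit width
    convert: str      # conversion type
    accumulator: str  # varies, but irrelevant to the prepack kernel


def prepacked_type_key(cfg: PrepackConfig):
    # Same shape of key as in generate.py: the accumulator is deliberately dropped.
    return (cfg.a, cfg.b_num_bits, cfg.convert)


configs = [
    PrepackConfig("half", 4, "half", "float"),
    PrepackConfig("half", 4, "half", "half"),  # same key, different accumulator
    PrepackConfig("bfloat16", 4, "bfloat16", "float"),
]

unique = {}
for cfg in configs:
    # Keep only the first config seen per key, so one prepack kernel per key.
    unique.setdefault(prepacked_type_key(cfg), cfg)

print(len(unique))  # 2 prepack instantiations instead of 3
```

Two configs that differ only in accumulator type collapse onto one key, so only one prepack instantiation is emitted for them.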

docs/getting_started/installation/cpu.md

Lines changed: 1 addition & 1 deletion

@@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
 - Offline Inference: `256 * world_size`
 - Online Serving: `128 * world_size`

-vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
+vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.

 ### Which quantization configs does vLLM CPU support?
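
As a rough companion to the paragraph touched above, a minimal offline-inference sketch that combines TP and PP might look like this; the model name and parallel sizes are placeholders, and adding DP on top is left out because the exact launcher or engine argument depends on the deployment:

```python
# Hedged sketch: combine tensor parallel and pipeline parallel with vLLM on CPU.
# Model name and sizes are illustrative; pick values that match your CPU sockets
# and memory (NUMA) nodes, as the installation guide recommends.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tensor_parallel_size=2,     # e.g. split layers across 2 sockets
    pipeline_parallel_size=2,   # e.g. 2 pipeline stages
)

outputs = llm.generate(
    ["What is pipeline parallelism?"],
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```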

tests/models/multimodal/generation/vlm_utils/core.py

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ def run_test(
     tensor_parallel_size: int = 1,
     vllm_embeddings: Optional[torch.Tensor] = None,
 ):
-    """Modality agnostic test test executor for comparing HF/vLLM outputs."""
+    """Modality agnostic test executor for comparing HF/vLLM outputs."""
     # In the case of embeddings, vLLM takes separate input tensors
     vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs

vllm/distributed/device_communicators/custom_all_reduce.py

Lines changed: 2 additions & 2 deletions

@@ -60,7 +60,7 @@ def __init__(self,
             group: the process group to work on. If None, it will use the
                 default process group.
             device: the device to bind the CustomAllreduce to. If None,
-                it will be bind to f"cuda:{local_rank}".
+                it will be bound to f"cuda:{local_rank}".
             It is the caller's responsibility to make sure each communicator
             is bind to a unique device, and all communicators in this group
             are in the same node.

@@ -158,7 +158,7 @@ def __init__(self,

         self.disabled = False
         # Buffers memory are owned by this Python class and passed to C++.
-        # Meta data composes of two parts: meta data for synchronization and a
+        # Metadata composes of two parts: metadata for synchronization and a
         # temporary buffer for storing intermediate allreduce results.
         self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
                                                    group=group,

vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py

Lines changed: 3 additions & 3 deletions

@@ -35,7 +35,7 @@ def adjust_request(
             self, request: ChatCompletionRequest) -> ChatCompletionRequest:
         if request.tools and request.tool_choice != 'none':
             # do not skip special tokens because internlm use the special
-            # tokens to indicated the start and end of the tool calls
+            # tokens to indicate the start and end of the tool calls
             # information.
             request.skip_special_tokens = False
         return request

@@ -60,8 +60,8 @@ def extract_tool_calls_streaming(
         if '<|action_start|>' not in current_text:
             self.position = len(current_text)
             return DeltaMessage(content=delta_text)
-        # if the tool call is sended, return a empty delta message
-        # to make sure the finish_reason will be send correctly.
+        # if the tool call is sended, return an empty delta message
+        # to make sure the finish_reason will be sent correctly.
         if self.current_tool_id > 0:
             return DeltaMessage(content='')

vllm/envs.py

Lines changed: 1 addition & 1 deletion

@@ -1064,7 +1064,7 @@ def get_vllm_port() -> Optional[int]:
     # vllm should use flashinfer fused allreduce. The variable should be a
     # JSON with the following format:
     # { <world size>: <max size in mb> }
-    # Unspecified world sizes will fallback to
+    # Unspecified world sizes will fall back to
    # { 2: 64, 4: 1, <everything else>: 0.5 }
     "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
     lambda: json.loads(os.getenv(
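
To make the JSON format described in that comment concrete, here is a hedged sketch of setting and reading the variable; the values shown are just the fallback defaults quoted in the comment, not tuned recommendations:

```python
# Hedged sketch: the env var holds a JSON object mapping world size to the
# maximum message size (in MB) for which flashinfer fused allreduce is used.
import json
import os

# e.g. in the shell:
#   export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2": 64, "4": 1}'
os.environ.setdefault(
    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", '{"2": 64, "4": 1}')

thresholds = json.loads(
    os.environ["VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB"])

world_size = 4
# World sizes missing from the JSON fall back to the defaults quoted in the
# comment: { 2: 64, 4: 1, <everything else>: 0.5 }.
max_size_mb = thresholds.get(str(world_size), 0.5)
print(max_size_mb)  # 1
```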

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 1 addition & 1 deletion

@@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
     EM = sorted_token_ids.size(0)
     if A.size(0) < config["BLOCK_SIZE_M"]:
         # optimize for small batch_size.
-        # We assume that top_ids of each token is unique, so
+        # We assume that top_ids of each token is unique,
         # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
         # and we can skip some invalid blocks.
         EM = min(sorted_token_ids.size(0),

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 2 additions & 2 deletions

@@ -710,7 +710,7 @@ def determine_expert_map(

     # Create a tensor of size num_experts filled with -1
     expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
-    # Create a expert map for the local experts
+    # Create an expert map for the local experts
     start_idx = ep_rank * base_experts + min(ep_rank, remainder)
     expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
         0, local_num_experts, dtype=torch.int32)

@@ -806,7 +806,7 @@ def __init__(

         self.global_num_experts = num_experts + num_redundant_experts

-        # we padding globally so EP buffer allocation works
+        # we are padding globally so EP buffer allocation works
         if quant_config and quant_config.get_name() == "mxfp4":
             from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501
                 should_use_flashinfer_mxfp4)
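
The comment fixed in the first hunk lives in expert-map construction. Below is a self-contained sketch of the same idea; base_experts and remainder are not shown in the hunk, so the even-split-with-remainder derivation here is an assumption inferred from the min(ep_rank, remainder) term:

```python
# Hedged sketch of building an expert map like determine_expert_map:
# global expert id -> local expert id on this EP rank, or -1 if not local.
import torch


def build_expert_map(ep_rank: int, ep_size: int,
                     global_num_experts: int) -> torch.Tensor:
    # Assumed derivation: split experts as evenly as possible across ranks,
    # giving the first `remainder` ranks one extra expert each.
    base_experts = global_num_experts // ep_size
    remainder = global_num_experts % ep_size
    local_num_experts = base_experts + (1 if ep_rank < remainder else 0)

    # Tensor of size global_num_experts filled with -1 (non-local experts).
    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
    # Expert map for the local experts, as in the hunk above.
    start_idx = ep_rank * base_experts + min(ep_rank, remainder)
    expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
        0, local_num_experts, dtype=torch.int32)
    return expert_map


# Example: 10 experts over 4 EP ranks -> ranks 0 and 1 hold 3 experts each,
# ranks 2 and 3 hold 2 each.
print(build_expert_map(ep_rank=1, ep_size=4, global_num_experts=10))
# tensor([-1, -1, -1,  0,  1,  2, -1, -1, -1, -1], dtype=torch.int32)
```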

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 2 additions & 2 deletions

@@ -469,7 +469,7 @@ def create_weights(
         )
         layer.register_parameter("w2_scales", w2_scales)
         set_weight_attrs(w2_scales, extra_weight_attrs)
-        # dont shard the w2 scales when running act order
+        # don't shard the w2 scales when running act order
         set_weight_attrs(w2_scales,
                          {"load_full_w2": self.quant_config.desc_act})
         # up_proj scales

@@ -493,7 +493,7 @@ def create_weights(
         )
         layer.register_parameter("w2_qzeros", w2_qzeros)
         set_weight_attrs(w2_qzeros, extra_weight_attrs)
-        # dont shard the w2 scales when running act order
+        # don't shard the w2 scales when running act order
         set_weight_attrs(w2_qzeros,
                          {"load_full_w2": self.quant_config.desc_act})
         w13_g_idx = torch.nn.Parameter(

vllm/v1/attention/backends/flashinfer.py

Lines changed: 1 addition & 1 deletion

@@ -687,7 +687,7 @@ def forward(
         else:
             raise ValueError(f"Unsupported output dtype: {output.dtype}")

-        # TRTLLM attn kernel requires o scale to pass as a host scalar,
+        # TRTLLM attn kernel requires to scale to pass as a host scalar,
         # store the o scale as a host scalar in warmup run with cuda graph
         # not enabled
         if layer._o_scale_float is None:
