
Commit 4136ce1

[Doc]: fix typos in Python comments
Signed-off-by: Didier Durand <[email protected]>
1 parent e599e2c commit 4136ce1

File tree

12 files changed: +17 -17 lines


csrc/quantization/machete/generate.py

Lines changed: 1 addition & 1 deletion

@@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
         ))

     def prepacked_type_key(prepack_type: PrepackTypeConfig):
-        # For now we we can just use the first accumulator type seen since
+        # For now, we can just use the first accumulator type seen since
         # the tensor core shapes/layouts don't vary based on accumulator
         # type so we can generate less code this way
         return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)
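
To illustrate why keying on just these three fields shrinks the generated code, here is a small, self-contained sketch (not the generator's actual logic); the PrepackConfig dataclass and its sample values are hypothetical stand-ins for PrepackTypeConfig:

```python
# Hypothetical sketch of deduplicating prepack instantiations by a coarse key,
# mirroring the idea in prepacked_type_key: the accumulator type is ignored
# because tensor core shapes/layouts don't depend on it.
from dataclasses import dataclass


@dataclass(frozen=True)
class PrepackConfig:  # stand-in for PrepackTypeConfig; fields are assumed
    a: str            # activation type
    b_num_bits: int   # weight bit width
    convert: str      # conversion type
    accumulator: str  # varies, but irrelevant to the prepack kernel


def prepacked_type_key(cfg: PrepackConfig):
    # Same shape of key as in generate.py: the accumulator is deliberately dropped.
    return (cfg.a, cfg.b_num_bits, cfg.convert)


configs = [
    PrepackConfig("half", 4, "half", "float"),
    PrepackConfig("half", 4, "half", "half"),  # same key, different accumulator
    PrepackConfig("bfloat16", 4, "bfloat16", "float"),
]

unique = {}
for cfg in configs:
    # Keep only the first config seen per key, so one prepack kernel per key.
    unique.setdefault(prepacked_type_key(cfg), cfg)

print(len(unique))  # 2 prepack instantiations instead of 3
```

Two configs that differ only in accumulator type collapse onto one key, so only one prepack instantiation is emitted for them.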

docs/getting_started/installation/cpu.md

Lines changed: 1 addition & 1 deletion

@@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
 - Offline Inference: `256 * world_size`
 - Online Serving: `128 * world_size`

-vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
+vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.

 ### Which quantization configs does vLLM CPU support?
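
As a rough companion to the paragraph touched above, a minimal offline-inference sketch that combines TP and PP might look like this; the model name and parallel sizes are placeholders, and adding DP on top is left out because the exact launcher or engine argument depends on the deployment:

```python
# Hedged sketch: combine tensor parallel and pipeline parallel with vLLM on CPU.
# Model name and sizes are illustrative; pick values that match your CPU sockets
# and memory (NUMA) nodes, as the installation guide recommends.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tensor_parallel_size=2,     # e.g. split layers across 2 sockets
    pipeline_parallel_size=2,   # e.g. 2 pipeline stages
)

outputs = llm.generate(
    ["What is pipeline parallelism?"],
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```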

tests/models/multimodal/generation/vlm_utils/core.py

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ def run_test(
     tensor_parallel_size: int = 1,
     vllm_embeddings: Optional[torch.Tensor] = None,
 ):
-    """Modality agnostic test test executor for comparing HF/vLLM outputs."""
+    """Modality agnostic test executor for comparing HF/vLLM outputs."""
     # In the case of embeddings, vLLM takes separate input tensors
     vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs

vllm/distributed/device_communicators/custom_all_reduce.py

Lines changed: 2 additions & 2 deletions

@@ -60,7 +60,7 @@ def __init__(self,
             group: the process group to work on. If None, it will use the
                 default process group.
             device: the device to bind the CustomAllreduce to. If None,
-                it will be bind to f"cuda:{local_rank}".
+                it will be bound to f"cuda:{local_rank}".
             It is the caller's responsibility to make sure each communicator
             is bind to a unique device, and all communicators in this group
             are in the same node.

@@ -158,7 +158,7 @@ def __init__(self,

         self.disabled = False
         # Buffers memory are owned by this Python class and passed to C++.
-        # Meta data composes of two parts: meta data for synchronization and a
+        # Metadata composes of two parts: metadata for synchronization and a
         # temporary buffer for storing intermediate allreduce results.
         self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
                                                    group=group,

vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py

Lines changed: 3 additions & 3 deletions

@@ -35,7 +35,7 @@ def adjust_request(
             self, request: ChatCompletionRequest) -> ChatCompletionRequest:
         if request.tools and request.tool_choice != 'none':
             # do not skip special tokens because internlm use the special
-            # tokens to indicated the start and end of the tool calls
+            # tokens to indicate the start and end of the tool calls
             # information.
             request.skip_special_tokens = False
         return request

@@ -60,8 +60,8 @@ def extract_tool_calls_streaming(
         if '<|action_start|>' not in current_text:
             self.position = len(current_text)
             return DeltaMessage(content=delta_text)
-        # if the tool call is sended, return a empty delta message
-        # to make sure the finish_reason will be send correctly.
+        # if the tool call is sended, return an empty delta message
+        # to make sure the finish_reason will be sent correctly.
         if self.current_tool_id > 0:
             return DeltaMessage(content='')

vllm/envs.py

Lines changed: 1 addition & 1 deletion

@@ -1064,7 +1064,7 @@ def get_vllm_port() -> Optional[int]:
     # vllm should use flashinfer fused allreduce. The variable should be a
     # JSON with the following format:
     # { <world size>: <max size in mb> }
-    # Unspecified world sizes will fallback to
+    # Unspecified world sizes will fall back to
    # { 2: 64, 4: 1, <everything else>: 0.5 }
     "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
     lambda: json.loads(os.getenv(
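
To make the JSON format described in that comment concrete, here is a hedged sketch of setting and reading the variable; the values shown are just the fallback defaults quoted in the comment, not tuned recommendations:

```python
# Hedged sketch: the env var holds a JSON object mapping world size to the
# maximum message size (in MB) for which flashinfer fused allreduce is used.
import json
import os

# e.g. in the shell:
#   export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2": 64, "4": 1}'
os.environ.setdefault(
    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", '{"2": 64, "4": 1}')

thresholds = json.loads(
    os.environ["VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB"])

world_size = 4
# World sizes missing from the JSON fall back to the defaults quoted in the
# comment: { 2: 64, 4: 1, <everything else>: 0.5 }.
max_size_mb = thresholds.get(str(world_size), 0.5)
print(max_size_mb)  # 1
```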

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 1 addition & 1 deletion

@@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
     EM = sorted_token_ids.size(0)
     if A.size(0) < config["BLOCK_SIZE_M"]:
         # optimize for small batch_size.
-        # We assume that top_ids of each token is unique, so
+        # We assume that top_ids of each token is unique,
         # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
         # and we can skip some invalid blocks.
         EM = min(sorted_token_ids.size(0),

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 2 additions & 2 deletions

@@ -710,7 +710,7 @@ def determine_expert_map(

     # Create a tensor of size num_experts filled with -1
     expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
-    # Create a expert map for the local experts
+    # Create an expert map for the local experts
     start_idx = ep_rank * base_experts + min(ep_rank, remainder)
     expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
         0, local_num_experts, dtype=torch.int32)

@@ -806,7 +806,7 @@ def __init__(

         self.global_num_experts = num_experts + num_redundant_experts

-        # we padding globally so EP buffer allocation works
+        # we are padding globally so EP buffer allocation works
         if quant_config and quant_config.get_name() == "mxfp4":
             from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501
                 should_use_flashinfer_mxfp4)
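
The comment fixed in the first hunk lives in expert-map construction. Below is a self-contained sketch of the same idea; base_experts and remainder are not shown in the hunk, so the even-split-with-remainder derivation here is an assumption inferred from the min(ep_rank, remainder) term:

```python
# Hedged sketch of building an expert map like determine_expert_map:
# global expert id -> local expert id on this EP rank, or -1 if not local.
import torch


def build_expert_map(ep_rank: int, ep_size: int,
                     global_num_experts: int) -> torch.Tensor:
    # Assumed derivation: split experts as evenly as possible across ranks,
    # giving the first `remainder` ranks one extra expert each.
    base_experts = global_num_experts // ep_size
    remainder = global_num_experts % ep_size
    local_num_experts = base_experts + (1 if ep_rank < remainder else 0)

    # Tensor of size global_num_experts filled with -1 (non-local experts).
    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
    # Expert map for the local experts, as in the hunk above.
    start_idx = ep_rank * base_experts + min(ep_rank, remainder)
    expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
        0, local_num_experts, dtype=torch.int32)
    return expert_map


# Example: 10 experts over 4 EP ranks -> ranks 0 and 1 hold 3 experts each,
# ranks 2 and 3 hold 2 each.
print(build_expert_map(ep_rank=1, ep_size=4, global_num_experts=10))
# tensor([-1, -1, -1,  0,  1,  2, -1, -1, -1, -1], dtype=torch.int32)
```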

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 2 additions & 2 deletions

@@ -469,7 +469,7 @@ def create_weights(
         )
         layer.register_parameter("w2_scales", w2_scales)
         set_weight_attrs(w2_scales, extra_weight_attrs)
-        # dont shard the w2 scales when running act order
+        # don't shard the w2 scales when running act order
         set_weight_attrs(w2_scales,
                          {"load_full_w2": self.quant_config.desc_act})
         # up_proj scales

@@ -493,7 +493,7 @@ def create_weights(
         )
         layer.register_parameter("w2_qzeros", w2_qzeros)
         set_weight_attrs(w2_qzeros, extra_weight_attrs)
-        # dont shard the w2 scales when running act order
+        # don't shard the w2 scales when running act order
         set_weight_attrs(w2_qzeros,
                          {"load_full_w2": self.quant_config.desc_act})
         w13_g_idx = torch.nn.Parameter(

vllm/v1/attention/backends/flashinfer.py

Lines changed: 1 addition & 1 deletion

@@ -687,7 +687,7 @@ def forward(
         else:
             raise ValueError(f"Unsupported output dtype: {output.dtype}")

-        # TRTLLM attn kernel requires o scale to pass as a host scalar,
+        # TRTLLM attn kernel requires to scale to pass as a host scalar,
         # store the o scale as a host scalar in warmup run with cuda graph
         # not enabled
         if layer._o_scale_float is None:
