
Commit 6d17974

Mohammad Othman authored and committed
[Perf] Optimize multi-token incremental detokenization
This commit optimizes incremental detokenization when handling multiple tokens per update, which is increasingly common with speculative decoding methods (EAGLE, Medusa, the n-gram proposer, etc.).

**Problem:** The original implementation in BaseIncrementalDetokenizer.update() processed tokens one by one in a loop, calling decode_next() for each token. This is inefficient when speculative decoding generates multiple tokens per step (up to 128 tokens in MAX_SPEC_LEN scenarios). For SlowIncrementalDetokenizer it was particularly costly: each decode_next() call invoked detokenize_incrementally() with the full token list, creating O(n) work per token, i.e. O(n^2) total for n new tokens.

**Solution:**

1. Refactored BaseIncrementalDetokenizer.update() to batch-process tokens when possible, via a new _decode_tokens_batch() method.
2. Special handling for the min_tokens edge case: when a batch crosses the min_tokens threshold, update() falls back to one-by-one processing so that stop_check_offset is tracked accurately for stop string detection.
3. Added a SlowIncrementalDetokenizer._decode_tokens_batch() override that processes tokens more efficiently while maintaining correct incremental state updates.
4. FastIncrementalDetokenizer keeps the default implementation (calling decode_next() per token), since DecodeStream requires per-token state updates.

Fixes TODO in vllm/v1/engine/detokenizer.py:115-116

Signed-off-by: Mohammad Othman <[email protected]>
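As a rough illustration of the cost this addresses, the toy model below (plain Python, not vLLM code) counts token reads under the commit's stated assumption that each per-token decode re-reads the full token list, while an ideal batched pass reads the prefix once:

```python
# Toy cost model, not vLLM code. Assumption (from the commit message): for
# SlowIncrementalDetokenizer, each per-token decode touches every token seen
# so far, while an ideal batched pass touches the existing prefix only once.

def per_token_cost(num_new: int, prefix_len: int) -> int:
    # Decode step i re-reads all prefix_len + i + 1 tokens.
    return sum(prefix_len + i + 1 for i in range(num_new))

def batched_cost(num_new: int, prefix_len: int) -> int:
    # One pass over the existing prefix plus the new tokens.
    return prefix_len + num_new

# 128 speculative tokens (the MAX_SPEC_LEN case) on a 1000-token prefix:
print(per_token_cost(128, 1000))  # 136256 token reads
print(batched_cost(128, 1000))    # 1128 token reads
```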
1 parent 2bb4435 commit 6d17974

File tree

1 file changed (+66, -6)

vllm/v1/engine/detokenizer.py

Lines changed: 66 additions & 6 deletions
```diff
@@ -112,13 +112,32 @@ def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
         skipped_stop_token_id = None
 
         # 1) Detokenize the new token ids incrementally.
-        # TODO(woosuk): This method becomes very inefficient when the number of
-        # new_token_ids is more than 1. We need to optimize this.
+        # Optimization: batch process multiple tokens for efficiency.
         stop_check_offset = len(self.output_text)
-        for new_token_id in new_token_ids:
-            self.token_ids.append(new_token_id)
-            self.output_text += self.decode_next(new_token_id)
-        # Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014
+
+        # Check if we need special handling for min_tokens.
+        # If we might cross the min_tokens threshold, process tokens one by one
+        # to accurately track the stop_check_offset position.
+        if (
+            self.min_tokens
+            and len(self.output_token_ids) < self.min_tokens
+            and len(self.output_token_ids) + len(new_token_ids) > self.min_tokens
+        ):
+            # We will cross min_tokens during this batch.
+            # Process one by one to track the exact position.
+            for new_token_id in new_token_ids:
+                self.token_ids.append(new_token_id)
+                self.output_text += self.decode_next(new_token_id)
+                # Update stop_check_offset while we're still under min_tokens.
+                if self.min_tokens and len(self.output_token_ids) <= self.min_tokens:
+                    stop_check_offset = len(self.output_text)
+        else:
+            # Fast path: batch process all tokens.
+            self.token_ids.extend(new_token_ids)
+            new_text = self._decode_tokens_batch(new_token_ids)
+            self.output_text += new_text
+
+        # Update stop_check_offset if still under min_tokens.
         if self.min_tokens and len(self.output_token_ids) <= self.min_tokens:
             stop_check_offset = len(self.output_text)
 
```
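As a worked example of the guard introduced above, the standalone restatement below (plain Python, hypothetical numbers) shows exactly when an update falls back to per-token processing:

```python
# Standalone restatement of the new min_tokens guard; numbers are hypothetical.

def crosses_min_tokens(num_output: int, num_new: int, min_tokens: int) -> bool:
    return (
        min_tokens > 0
        and num_output < min_tokens
        and num_output + num_new > min_tokens
    )

# 3 tokens emitted so far, min_tokens=5, a 4-token speculative batch arrives:
# the batch crosses the threshold mid-update -> per-token fallback path.
assert crosses_min_tokens(num_output=3, num_new=4, min_tokens=5)

# Landing exactly on min_tokens does not cross it (strict '>'), and a request
# already at or past the threshold stays on the batched fast path.
assert not crosses_min_tokens(num_output=3, num_new=2, min_tokens=5)
assert not crosses_min_tokens(num_output=5, num_new=4, min_tokens=5)
```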

```diff
@@ -142,6 +161,17 @@ def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
 
         return stop_string
 
+    def _decode_tokens_batch(self, token_ids: list[int]) -> str:
+        """Decode a batch of tokens efficiently.
+
+        Default implementation processes tokens one by one.
+        Subclasses can override for more efficient batch processing.
+        """
+        result = ""
+        for token_id in token_ids:
+            result += self.decode_next(token_id)
+        return result
+
     @abstractmethod
     def decode_next(self, next_token_id: int) -> str:
         raise NotImplementedError
```
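This default also pins down the contract an override must keep: batch decoding must equal the in-order concatenation of per-token decode_next() results, which is why FastIncrementalDetokenizer can keep it unchanged. A minimal stub (hypothetical class, not vLLM's) making that invariant explicit:

```python
# Hypothetical stub illustrating the _decode_tokens_batch override contract:
# the batch result must match per-token decode_next() output, concatenated.

class TinyDetok:
    VOCAB = {1: "Hel", 2: "lo", 3: "!"}

    def decode_next(self, token_id: int) -> str:
        return self.VOCAB[token_id]

    def _decode_tokens_batch(self, token_ids: list[int]) -> str:
        # Same shape as the default implementation shown in the diff.
        result = ""
        for token_id in token_ids:
            result += self.decode_next(token_id)
        return result

d = TinyDetok()
ids = [1, 2, 3]
assert d._decode_tokens_batch(ids) == "".join(d.decode_next(t) for t in ids)
```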
```diff
@@ -312,6 +342,36 @@ def decode_next(self, next_token_id: int) -> str:
 
         return decoded_text
 
+    def _decode_tokens_batch(self, token_ids: list[int]) -> str:
+        """Optimized batch decoding for SlowIncrementalDetokenizer.
+
+        Processes multiple tokens more efficiently by calling
+        detokenize_incrementally once per token but with properly
+        accumulated state.
+        """
+        result = ""
+        base_len = len(self.token_ids) - len(token_ids)
+
+        for i, token_id in enumerate(token_ids):
+            new_tokens, decoded_text, prefix_offset, read_offset = (
+                detokenize_incrementally(
+                    tokenizer=self.tokenizer,
+                    all_input_ids=self.token_ids[: base_len + i + 1],
+                    prev_tokens=self.tokens,
+                    prefix_offset=self.prefix_offset,
+                    read_offset=self.read_offset,
+                    skip_special_tokens=self.skip_special_tokens,
+                    spaces_between_special_tokens=self.spaces_between_special_tokens,
+                )
+            )
+
+            self.tokens.extend(new_tokens)
+            self.prefix_offset = prefix_offset
+            self.read_offset = read_offset
+            result += decoded_text
+
+        return result
+
 
 def check_stop_strings(
     output_text: str,
```
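The override works by threading prefix_offset and read_offset through successive detokenize_incrementally calls so that each call emits only newly stable text. The toy below models that offset contract with a trivial "tokenizer" whose tokens are plain string fragments (an illustration only, not vLLM's helper; real BPE needs the prefix window to resolve merges):

```python
# Toy model of the prefix_offset / read_offset threading pattern. Tokens are
# plain string fragments here; this is not vLLM's detokenize_incrementally.

def detok_incremental(tokens: list[str], prefix_offset: int, read_offset: int):
    """Return (new_text, new_prefix_offset, new_read_offset)."""
    prefix_text = "".join(tokens[prefix_offset:read_offset])
    full_text = "".join(tokens[prefix_offset:])
    # Emit only the suffix not already visible at the previous read_offset.
    return full_text[len(prefix_text):], read_offset, len(tokens)

tokens: list[str] = []
prefix_offset = read_offset = 0
out = ""
for frag in ["Hel", "lo", ", ", "wor", "ld"]:
    tokens.append(frag)
    new_text, prefix_offset, read_offset = detok_incremental(
        tokens, prefix_offset, read_offset
    )
    out += new_text

assert out == "Hello, world"
```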
