1 parent b046119 commit 0c59e9d
vllm/v1/spec_decode/eagle.py
@@ -654,7 +654,9 @@ def load_model(self, target_model: nn.Module) -> None:
         self.hot_token_ids = None
         if self.vllm_config.speculative_config.draft_vocab_pruned:
             logger.info(f"Loading pruned draft model vocabulary from {self.vllm_config.speculative_config.draft_vocab_pruned}")
-            self.hot_token_ids = load_draft_vocab_pruned(self.vllm_config.speculative_config.draft_vocab_pruned).to(self.model.device)
+            self.hot_token_ids = load_draft_vocab_pruned(self.vllm_config.speculative_config.draft_vocab_pruned)
+            device = next(self.model.model.parameters()).device
+            self.hot_token_ids = self.hot_token_ids.to(device)
         head = self.model.model.embed_tokens.weight
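For context on the change: a plain `torch.nn.Module` does not define a `.device` attribute, so `self.model.device` would fail unless the wrapper class adds one; the commit instead derives the device from the module's parameters before moving the loaded token ids. A minimal sketch of that pattern, using a hypothetical toy module in place of the draft model:

import torch
import torch.nn as nn

# Hypothetical stand-in for the draft model's inner `model` module.
model = nn.Linear(8, 8)

# nn.Module exposes no `.device` attribute; the usual workaround is to
# read the device of any parameter.
device = next(model.parameters()).device

# Tensors loaded on CPU (e.g. pruned vocabulary token ids) can then be
# moved to wherever the model's weights actually live.
hot_token_ids = torch.arange(16)
hot_token_ids = hot_token_ids.to(device)
print(hot_token_ids.device)  # matches the model's parameter device

This pattern assumes the module has at least one parameter and that all of its parameters live on a single device.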