Skip to content

推测解码 — 代码走读

目录结构 — vllm/v1/spec_decode/

spec_decode/
├── llm_base_proposer.py     # LLM 草稿模型基类 (75K)
├── ngram_proposer.py         # N-gram 提议器
├── ngram_proposer_gpu.py     # GPU 加速 N-gram (25K)
├── eagle.py                  # EAGLE 实现
├── medusa.py                 # Medusa 实现
├── dflash.py                 # DFlash 实现
├── suffix_decoding.py        # 后缀解码
├── extract_hidden_states.py  # 隐藏状态提取
└── gemma4.py                 # Gemma4 特化实现

N-gram Proposer — ngram_proposer_gpu.py

python
class NGramProposerGPU:
    """GPU-accelerated n-gram speculative proposer.

    Uses the trailing ``n - 1`` tokens of every sequence as the lookup key
    into a trie and returns the leading candidates as draft tokens.
    """

    def __init__(self, n=3, k=5):
        """Store the n-gram settings and create the lookup trie.

        Args:
            n: Order of the n-gram; ``n - 1`` trailing tokens form the key.
            k: Number of candidates requested from each lookup result.
        """
        self.n = n          # n-gram order
        self.k = k          # number of candidate tokens
        self.trie = GPUTrie()  # trie structure held on the GPU

    def propose(self, input_ids, num_speculative_tokens):
        """Propose draft tokens from the most recent n-gram context.

        Args:
            input_ids: Token ids indexed as ``[row, position]``.
            num_speculative_tokens: Maximum number of draft tokens to keep
                per row.

        Returns:
            The first ``num_speculative_tokens`` columns of the top-k
            candidates produced by the trie lookup.
        """
        # 1. Look up the most recent (n - 1)-token context in the trie.
        # A plain ``input_ids[:, -(n - 1):]`` breaks for n == 1 because
        # ``-0`` is ``0`` and the slice would select the whole sequence
        # instead of an empty context, so the empty case is spelled out.
        context_len = self.n - 1
        if context_len > 0:
            context = input_ids[:, -context_len:]
        else:
            context = input_ids[:, :0]
        matches = self.trie.lookup(context)

        # 2. Keep the top-k candidates.
        # NOTE(review): assumes ``topk`` returns something 2-D sliceable
        # (not a (values, indices) pair) — confirm against GPUTrie.
        candidates = matches.topk(self.k)

        # 3. Truncate to the requested number of speculative tokens.
        speculative_tokens = candidates[:, :num_speculative_tokens]
        return speculative_tokens

EAGLE — eagle.py

python
class EAGLEProposer:
    """EAGLE speculative-decoding proposer.

    Runs a lightweight draft model autoregressively on top of features taken
    from the target model to produce draft tokens and their distributions.
    """

    def __init__(self, target_model_config, num_speculative_tokens=5):
        """Build the draft model and record the draft length.

        Args:
            target_model_config: Configuration handed to
                ``_create_draft_model``.
            num_speculative_tokens: Number of draft steps ``propose`` runs.
                ``propose`` reads this attribute, so it must be set here;
                the default is illustrative — TODO confirm the real source
                of this value.
        """
        self.num_speculative_tokens = num_speculative_tokens
        # Create the lightweight draft model.
        self.draft_model = self._create_draft_model(target_model_config)

    def propose(self, hidden_states, input_ids):
        """Generate draft tokens greedily with the draft model.

        Args:
            hidden_states: Indexable collection; only its last element is
                used (presumably the last layer's hidden state — confirm).
            input_ids: Not read by the code shown here.

        Returns:
            A ``(draft_tokens, draft_probs)`` pair of lists with one entry
            per draft step.
        """
        # 1. Take the features from the target model.
        features = hidden_states[-1]

        # 2. Let the draft model generate candidates autoregressively.
        # NOTE(review): softmax, argmax, _create_draft_model and
        # _embed_and_concat are not defined in this snippet.
        draft_tokens = []
        draft_probs = []
        current_input = features

        for _ in range(self.num_speculative_tokens):
            logits = self.draft_model(current_input)
            probs = softmax(logits)
            token = argmax(probs)  # greedy choice

            draft_tokens.append(token)
            draft_probs.append(probs)
            # Feed the chosen token back in for the next step.
            current_input = self._embed_and_concat(current_input, token)

        return draft_tokens, draft_probs

Rejection Sampler — vllm/v1/sample/rejection_sampler.py

python
class RejectionSampler:
    """Verify draft tokens and decide which ones to accept or reject."""

    def forward(self, draft_tokens, draft_probs, target_probs):
        """Keep the accepted draft prefix and resample the first rejection.

        Args:
            draft_tokens: Draft token ids indexed as ``[row, position]``.
            draft_probs: Draft-model probabilities.
            target_probs: Target-model probabilities, same shape as
                ``draft_probs``.
                NOTE(review): the code below only works if both hold one
                probability per draft position, i.e. shape
                ``(batch, num_draft)`` — TODO confirm.

        Returns:
            A copy of ``draft_tokens`` where, for every row with a rejection,
            the positions from the first rejection onward are zeroed and the
            first rejected position holds a resampled token. Rows with no
            rejection are returned unchanged.
        """
        num_draft = draft_tokens.shape[1]
        if num_draft == 0:
            # Nothing to verify; argmax over an empty dimension would raise.
            return draft_tokens.clone()

        # 1. Acceptance probability of each token: min(1, p_target / p_draft).
        accept_probs = torch.minimum(
            torch.ones_like(draft_probs),
            target_probs / draft_probs,
        )

        # 2. Sample accept / reject.
        random_values = torch.rand_like(accept_probs)
        accepted = random_values < accept_probs

        # 3. Locate the first rejected position in every row.
        # argmax over an all-zero row returns 0, which would make a fully
        # accepted row look like a rejection at position 0; map those rows
        # to ``num_draft`` ("no rejection") instead.
        rejected_mask = ~accepted
        first_reject = rejected_mask.float().argmax(dim=-1)
        first_reject = torch.where(
            rejected_mask.any(dim=-1),
            first_reject,
            torch.full_like(first_reject, num_draft),
        )

        # 4. Truncate at the first rejection.
        output_tokens = draft_tokens.clone()
        for i in range(draft_tokens.shape[0]):
            pos = int(first_reject[i])
            if pos >= num_draft:
                continue  # every draft token of this row was accepted

            output_tokens[i, pos:] = 0  # clear the rejected tokens

            # Resample the first rejected token from the adjusted
            # distribution max(0, p_target - p_draft).
            # NOTE(review): with (batch, num_draft) inputs this residual
            # spans draft positions rather than the vocabulary — confirm
            # what ``sample`` expects.
            adjusted = torch.clamp(
                target_probs[i] - draft_probs[i], min=0
            )
            output_tokens[i, pos] = sample(adjusted)

        return output_tokens

与 ModelRunner 的集成

推测解码在 ModelRunner 中通过 hooks 集成:

python
class ModelRunner:
    """Sketch of how speculative decoding hooks into the model runner."""

    def execute_model(self, scheduler_output):
        """Run the forward pass and, if enabled, a draft-and-verify step.

        Args:
            scheduler_output: Scheduling result for this step.
                NOTE(review): not read by the code shown here.

        Returns:
            The value returned by ``self._forward``. When
            ``self.speculative_config`` is truthy, its
            ``speculative_tokens`` attribute is set to the rejection
            sampler's result.
        """
        # Standard forward pass.
        # NOTE(review): ``input_batch`` is not defined in this snippet;
        # presumably it is derived from ``scheduler_output`` — confirm.
        output = self._forward(input_batch)

        # Only when speculative decoding is enabled.
        if self.speculative_config:
            # 1. Draft phase: generate candidate tokens. The rejection
            # sampler below needs the draft probabilities as well, so the
            # (tokens, probs) pair is unpacked here, matching what
            # EAGLEProposer.propose returns.
            # NOTE(review): NGramProposerGPU.propose returns tokens only —
            # confirm how that proposer is meant to be verified.
            draft_tokens, draft_probs = self.spec_proposer.propose(...)

            # 2. Verify phase: check the drafts against the target model.
            # NOTE(review): logits are passed where RejectionSampler.forward
            # names its parameter ``target_probs`` — confirm whether a
            # softmax is expected first.
            verified = self.rejection_sampler(
                draft_tokens, draft_probs, output.logits,
            )

            output.speculative_tokens = verified

        return output

关键函数索引

| 函数/类 | 文件 | 职责 |
| --- | --- | --- |
| `NGramProposerGPU.propose()` | `spec_decode/ngram_proposer_gpu.py` | N-gram 候选生成 |
| `EAGLEProposer.propose()` | `spec_decode/eagle.py` | EAGLE 草稿生成 |
| `RejectionSampler.forward()` | `sample/rejection_sampler.py` | 验证和接受/拒绝 |
| `LLMBaseProposer` | `spec_decode/llm_base_proposer.py` | LLM 草稿模型基类 |
| `extract_hidden_states()` | `spec_decode/extract_hidden_states.py` | 提取目标模型特征 |