Appearance
推测解码 — 代码走读
目录结构 — vllm/v1/spec_decode/
spec_decode/
├── llm_base_proposer.py # LLM 草稿模型基类 (75K)
├── ngram_proposer.py # N-gram 提议器
├── ngram_proposer_gpu.py # GPU 加速 N-gram (25K)
├── eagle.py # EAGLE 实现
├── medusa.py # Medusa 实现
├── dflash.py # DFlash 实现
├── suffix_decoding.py # 后缀解码
├── extract_hidden_states.py # 隐藏状态提取
└── gemma4.py # Gemma4 特化实现
N-gram Proposer — ngram_proposer_gpu.py
python
class NGramProposerGPU:
    """GPU-accelerated n-gram speculative proposer.

    Looks up the most recent context tokens in a trie and returns the
    top-ranked continuations as draft tokens.
    """

    def __init__(self, n=3, k=5):
        """Create the proposer.

        Args:
            n: Order of the n-gram; the lookup context is the last ``n - 1``
                tokens of each row.
            k: Number of candidates kept by the top-k selection.
        """
        self.n = n  # n-gram order
        self.k = k  # number of candidate tokens
        self.trie = GPUTrie()  # trie structure (described as living on the GPU)

    def propose(self, input_ids, num_speculative_tokens):
        """Return draft tokens for the given context.

        Args:
            input_ids: Token ids indexed as ``[row, position]``.
                # assumes shape (batch, seq_len) — TODO confirm
            num_speculative_tokens: Maximum number of draft tokens kept per
                row.

        Returns:
            The first ``num_speculative_tokens`` columns of the top-k
            candidates.
        """
        # 1. Look up the most recent n-gram context in the trie.
        context_len = self.n - 1
        if context_len > 0:
            context = input_ids[:, -context_len:]
        else:
            # ``-0`` equals ``0``, so ``input_ids[:, -0:]`` would select the
            # whole sequence; a unigram model has an empty context instead.
            context = input_ids[:, :0]
        matches = self.trie.lookup(context)
        # 2. Keep the top-k candidates.
        # NOTE(review): if ``matches`` is a torch.Tensor, ``topk`` returns a
        # (values, indices) pair rather than a tensor — confirm the type
        # returned by ``GPUTrie.lookup``.
        candidates = matches.topk(self.k)
        # 3. Truncate to the requested number of speculative tokens.
        return candidates[:, :num_speculative_tokens]


# EAGLE — eagle.py
python
class EAGLEProposer:
    """EAGLE speculative-decoding proposer."""

    def __init__(self, target_model_config, num_speculative_tokens=5):
        """Build the draft model and record the speculation depth.

        Args:
            target_model_config: Configuration of the target model; forwarded
                to ``_create_draft_model``.
            num_speculative_tokens: Number of draft tokens generated by each
                ``propose`` call. The default is only a placeholder; callers
                should pass the configured speculation depth.
        """
        # Create the lightweight draft model.
        self.draft_model = self._create_draft_model(target_model_config)
        # ``propose`` iterates over this attribute, so it must exist on the
        # instance before the first call.
        self.num_speculative_tokens = num_speculative_tokens

    def propose(self, hidden_states, input_ids):
        """Generate draft tokens autoregressively with the draft model.

        Args:
            hidden_states: Sequence of target-model hidden states; only the
                last entry is used.
            input_ids: Accepted for interface compatibility; not read here.

        Returns:
            A ``(draft_tokens, draft_probs)`` pair of lists, one entry per
            speculative step.
        """
        # 1. Take the features from the target model (last hidden state).
        current_input = hidden_states[-1]
        # 2. Let the draft model generate candidate tokens one at a time.
        draft_tokens = []
        draft_probs = []
        for _ in range(self.num_speculative_tokens):
            logits = self.draft_model(current_input)
            probs = softmax(logits)
            token = argmax(probs)  # greedy choice
            draft_tokens.append(token)
            draft_probs.append(probs)
            # Feed the chosen token back in for the next step.
            current_input = self._embed_and_concat(current_input, token)
        return draft_tokens, draft_probs


# Rejection Sampler — vllm/v1/sample/rejection_sampler.py
python
class RejectionSampler:
    """Verify draft tokens and decide which ones are accepted or rejected."""

    def forward(self, draft_tokens, draft_probs, target_probs):
        """Keep each row's accepted prefix and resample its first rejection.

        Args:
            draft_tokens: Draft token ids indexed as ``[row, position]``.
            draft_probs: Draft-model probabilities, element-wise compatible
                with ``target_probs``.
                # assumes one probability per draft token, same shape as
                # draft_tokens — TODO confirm
            target_probs: Target-model probabilities for the same entries.

        Returns:
            A copy of ``draft_tokens``. In every row that has a rejection,
            the first rejected position holds a resampled token and all later
            positions are set to 0. Rows without any rejection are returned
            unchanged.
        """
        output_tokens = draft_tokens.clone()
        num_positions = draft_tokens.shape[1]
        if num_positions == 0:
            # Nothing was drafted; ``argmax`` over an empty dimension raises.
            return output_tokens

        # 1. Acceptance probability of every token: min(1, p_target / p_draft).
        accept_probs = torch.minimum(
            torch.ones_like(draft_probs),
            target_probs / draft_probs,
        )
        # 2. Sample accept / reject.
        random_values = torch.rand_like(accept_probs)
        rejected_mask = ~(random_values < accept_probs)
        # 3. Locate the first rejected position. ``argmax`` returns 0 for a
        #    row with no rejection at all, which would wrongly discard a fully
        #    accepted row, so such rows are mapped to ``num_positions``.
        first_reject = rejected_mask.float().argmax(dim=-1)
        first_reject = torch.where(
            rejected_mask.any(dim=-1),
            first_reject,
            torch.full_like(first_reject, num_positions),
        )
        # 4. Truncate at the first rejected position.
        for i in range(draft_tokens.shape[0]):
            cut = int(first_reject[i])
            output_tokens[i, cut:] = 0  # zero out the rejected tokens
            # Resample the first rejected token from the adjusted distribution.
            if cut < num_positions:
                # NOTE(review): ``adjusted`` spans the row's per-position
                # probabilities; a residual distribution over the vocabulary
                # at the rejected position may be intended — confirm against
                # the contract of ``sample``.
                adjusted = torch.clamp(
                    target_probs[i] - draft_probs[i], min=0
                )
                output_tokens[i, cut] = sample(adjusted)
        return output_tokens


# 与 ModelRunner 的集成
推测解码在 ModelRunner 中通过 hooks 集成:
python
class ModelRunner:
    """Model runner showing where speculative decoding hooks into execution."""

    def execute_model(self, scheduler_output):
        """Run the forward pass and, when enabled, the draft/verify steps.

        Args:
            scheduler_output: Output of the scheduler for this step.

        Returns:
            The forward-pass output. When ``self.speculative_config`` is
            truthy, its ``speculative_tokens`` attribute holds the verified
            tokens.
        """
        # Standard forward pass.
        # NOTE(review): ``input_batch`` is not defined in this snippet;
        # presumably it is built from ``scheduler_output`` — confirm.
        output = self._forward(input_batch)
        # Only when speculative decoding is enabled.
        if self.speculative_config:
            # 1. Draft phase: generate candidate tokens. The sampler needs the
            #    draft probabilities as well, so both values are unpacked.
            #    Assumes a proposer that returns (tokens, probs) — TODO
            #    confirm for proposers that return tokens only.
            draft_tokens, draft_probs = self.spec_proposer.propose(...)
            # 2. Verify phase: check the drafts against the target model.
            # NOTE(review): the sampler's third parameter is named
            # ``target_probs`` while logits are passed here — confirm whether
            # they must be normalised first.
            verified = self.rejection_sampler(
                draft_tokens, draft_probs, output.logits,
            )
            output.speculative_tokens = verified
        return output


# 关键函数索引
| 函数/类 | 文件 | 职责 |
|---|---|---|
| NGramProposerGPU.propose() | spec_decode/ngram_proposer_gpu.py | N-gram 候选生成 |
| EAGLEProposer.propose() | spec_decode/eagle.py | EAGLE 草稿生成 |
| RejectionSampler.forward() | sample/rejection_sampler.py | 验证和接受/拒绝 |
| LLMBaseProposer | spec_decode/llm_base_proposer.py | LLM 草稿模型基类 |
| extract_hidden_states() | spec_decode/extract_hidden_states.py | 提取目标模型特征 |