Appearance
# 模型库与算子层 — 代码走读

## LLaMA 模型实现 — vllm/model_executor/models/llama.py

### 模型结构

```python
@ModelRegistry.register("LlamaForCausalLM")
class LlamaForCausalLM(nn.Module):
    def __init__(self, config, *, model_config, cache_config, quant_config):
        self.model = LlamaModel(config, ...)
        self.lm_head = ParallelLMHead(config.vocab_size, ...)

    def forward(self, input_ids, positions, kv_caches, ...):
        hidden = self.model(input_ids, positions, kv_caches)
        return self.lm_head(hidden)

    def load_weights(self, weights):
        # 权重加载逻辑
        ...
```

### LlamaModel

```python
class LlamaModel(nn.Module):
    def __init__(self, config, ...):
        self.embed_tokens = VocabParallelEmbedding(...)
        self.layers = nn.ModuleList([
            LlamaDecoderLayer(config, ...) for _ in range(config.num_hidden_layers)
        ])
        self.norm = RMSNorm(config.hidden_size, ...)

    def forward(self, input_ids, positions, kv_caches):
        hidden = self.embed_tokens(input_ids)
        for i, layer in enumerate(self.layers):
            hidden = layer(
                hidden, positions,
                kv_caches[i],  # 每层有独立的 KV 缓存
            )
        return self.norm(hidden)
```

### LlamaDecoderLayer

```python
class LlamaDecoderLayer(nn.Module):
    def __init__(self, config, ...):
        self.self_attn = Attention(...)  # 统一注意力层
        self.mlp = LlamaMLP(...)  # FFN
        self.input_layernorm = RMSNorm(...)
        self.post_attention_layernorm = RMSNorm(...)

    def forward(self, hidden, positions, kv_cache):
        # Pre-norm 架构
        residual = hidden
        hidden = self.input_layernorm(hidden)
        hidden = self.self_attn(hidden, positions, kv_cache)
        hidden = residual + hidden
        residual = hidden
        hidden = self.post_attention_layernorm(hidden)
        hidden = self.mlp(hidden)
        hidden = residual + hidden
        return hidden
```

## Attention 层 — model_executor/layers/attention/

### 统一注意力接口

```python
class Attention(nn.Module):
    def __init__(self, num_heads, head_dim, ...):
        self.q_proj = ColumnParallelLinear(...)
        self.k_proj = ColumnParallelLinear(...)
        self.v_proj = ColumnParallelLinear(...)
        self.o_proj = RowParallelLinear(...)

    def forward(self, hidden, positions, kv_cache):
        q = self.q_proj(hidden)
        k = self.k_proj(hidden)
        v = self.v_proj(hidden)
        # 应用 RoPE
        q, k = self.rotary_emb(positions, q, k)
        # PagedAttention:更新 KV 缓存
        k, v = self._update_kv_cache(k, v, kv_cache)
        # 调用注意力后端
        output = self.attn_backend.forward(
            q, k, v, kv_cache.block_table, ...
        )
        return self.o_proj(output)
```

## Linear 层 — model_executor/layers/linear.py

### 并行线性层

```python
class ColumnParallelLinear(nn.Module):
    """列切分:输出维度沿 TP rank 切分"""

    def __init__(self, input_size, output_size, ...):
        self.weight = Parameter(output_size // tp_size, input_size)

    def forward(self, x):
        output = F.linear(x, self.weight)
        # All-reduce 在后续 RowParallelLinear 中完成
        return output


class RowParallelLinear(nn.Module):
    """行切分:输入维度沿 TP rank 切分"""

    def __init__(self, input_size, output_size, ...):
        self.weight = Parameter(output_size, input_size // tp_size)

    def forward(self, x):
        output = F.linear(x, self.weight)
        output = tensor_model_parallel_all_reduce(output)
        return output
```

## Fused MoE — model_executor/layers/fused_moe/

### MoE 层实现

```python
class FusedMoE(nn.Module):
    def __init__(self, num_experts, top_k, ...):
        self.w13 = Parameter(num_experts, 2 * intermediate_size, hidden_size)
        self.w2 = Parameter(num_experts, hidden_size, intermediate_size)
        self.router = nn.Linear(hidden_size, num_experts)

    def forward(self, hidden):
        # 1. Router 计算
        router_logits = self.router(hidden)
        top_k_weights, top_k_indices = torch.topk(router_logits, self.top_k)
        # 2. Fused MoE kernel
        output = fused_moe_kernel(
            hidden, self.w13, self.w2,
            top_k_weights, top_k_indices,
        )
        return output
```

## 关键函数索引

| 函数/类 | 文件 | 职责 |
|---|---|---|
| ModelRegistry.register() | models/registry.py | 注册模型架构 |
| LlamaForCausalLM.forward() | models/llama.py | LLaMA 前向传播 |
| Attention.forward() | layers/attention/ | 统一注意力计算 |
| ColumnParallelLinear | layers/linear.py | 列切分线性层 |
| FusedMoE.forward() | layers/fused_moe/ | MoE 前向传播 |
| RMSNorm | layers/layernorm.py | RMS 归一化 |
| RotaryEmbedding | layers/rotary_embedding/ | RoPE 位置编码 |