Skip to content

模型库与算子层 — 代码走读

LLaMA 模型实现 — vllm/model_executor/models/llama.py

模型结构

python
@ModelRegistry.register("LlamaForCausalLM")
class LlamaForCausalLM(nn.Module):
    def __init__(self, config, *, model_config, cache_config, quant_config):
        self.model = LlamaModel(config, ...)
        self.lm_head = ParallelLMHead(config.vocab_size, ...)

    def forward(self, input_ids, positions, kv_caches, ...):
        hidden = self.model(input_ids, positions, kv_caches)
        return self.lm_head(hidden)

    def load_weights(self, weights):
        # 权重加载逻辑
        ...

LlamaModel

python
class LlamaModel(nn.Module):
    def __init__(self, config, ...):
        self.embed_tokens = VocabParallelEmbedding(...)
        self.layers = nn.ModuleList([
            LlamaDecoderLayer(config, ...) for _ in range(config.num_hidden_layers)
        ])
        self.norm = RMSNorm(config.hidden_size, ...)

    def forward(self, input_ids, positions, kv_caches):
        hidden = self.embed_tokens(input_ids)
        for i, layer in enumerate(self.layers):
            hidden = layer(
                hidden, positions,
                kv_caches[i],  # 每层有独立的 KV 缓存
            )
        return self.norm(hidden)

LlamaDecoderLayer

python
class LlamaDecoderLayer(nn.Module):
    def __init__(self, config, ...):
        self.self_attn = Attention(...)  # 统一注意力层
        self.mlp = LlamaMLP(...)         # FFN
        self.input_layernorm = RMSNorm(...)
        self.post_attention_layernorm = RMSNorm(...)

    def forward(self, hidden, positions, kv_cache):
        # Pre-norm 架构
        residual = hidden
        hidden = self.input_layernorm(hidden)
        hidden = self.self_attn(hidden, positions, kv_cache)
        hidden = residual + hidden

        residual = hidden
        hidden = self.post_attention_layernorm(hidden)
        hidden = self.mlp(hidden)
        hidden = residual + hidden
        return hidden

Attention 层 — model_executor/layers/attention/

统一注意力接口

python
class Attention(nn.Module):
    def __init__(self, num_heads, head_dim, ...):
        self.q_proj = ColumnParallelLinear(...)
        self.k_proj = ColumnParallelLinear(...)
        self.v_proj = ColumnParallelLinear(...)
        self.o_proj = RowParallelLinear(...)

    def forward(self, hidden, positions, kv_cache):
        q = self.q_proj(hidden)
        k = self.k_proj(hidden)
        v = self.v_proj(hidden)

        # 应用 RoPE
        q, k = self.rotary_emb(positions, q, k)

        # PagedAttention:更新 KV 缓存
        k, v = self._update_kv_cache(k, v, kv_cache)

        # 调用注意力后端
        output = self.attn_backend.forward(
            q, k, v, kv_cache.block_table, ...
        )
        return self.o_proj(output)

Linear 层 — model_executor/layers/linear.py

并行线性层

python
class ColumnParallelLinear(nn.Module):
    """列切分:输出维度沿 TP rank 切分"""
    def __init__(self, input_size, output_size, ...):
        self.weight = Parameter(output_size // tp_size, input_size)

    def forward(self, x):
        output = F.linear(x, self.weight)
        # All-reduce 在后续 RowParallelLinear 中完成
        return output

class RowParallelLinear(nn.Module):
    """行切分:输入维度沿 TP rank 切分"""
    def __init__(self, input_size, output_size, ...):
        self.weight = Parameter(output_size, input_size // tp_size)

    def forward(self, x):
        output = F.linear(x, self.weight)
        output = tensor_model_parallel_all_reduce(output)
        return output

Fused MoE — model_executor/layers/fused_moe/

MoE 层实现

python
class FusedMoE(nn.Module):
    def __init__(self, num_experts, top_k, ...):
        self.w13 = Parameter(num_experts, 2 * intermediate_size, hidden_size)
        self.w2 = Parameter(num_experts, hidden_size, intermediate_size)
        self.router = nn.Linear(hidden_size, num_experts)

    def forward(self, hidden):
        # 1. Router 计算
        router_logits = self.router(hidden)
        top_k_weights, top_k_indices = torch.topk(router_logits, self.top_k)

        # 2. Fused MoE kernel
        output = fused_moe_kernel(
            hidden, self.w13, self.w2,
            top_k_weights, top_k_indices,
        )
        return output

关键函数索引

| 函数/类 | 文件 | 职责 |
| --- | --- | --- |
| `ModelRegistry.register()` | `models/registry.py` | 注册模型架构 |
| `LlamaForCausalLM.forward()` | `models/llama.py` | LLaMA 前向传播 |
| `Attention.forward()` | `layers/attention/` | 统一注意力计算 |
| `ColumnParallelLinear` | `layers/linear.py` | 列切分线性层 |
| `FusedMoE.forward()` | `layers/fused_moe/` | MoE 前向传播 |
| `RMSNorm` | `layers/layernorm.py` | RMS 归一化 |
| `RotaryEmbedding` | `layers/rotary_embedding/` | RoPE 位置编码 |