Skip to content

KV缓存与PagedAttention — 代码走读

KVCacheManager — vllm/v1/core/kv_cache_manager.py

初始化

python
class KVCacheManager:
    """Bookkeeping for KV-cache blocks: a block pool plus two lookup tables.

    The constructor stores the block size, builds a ``BlockPool`` over
    ``num_blocks`` blocks, and creates two initially empty dictionaries: one
    keyed by block hash and one keyed by request id.
    """

    # NOTE(review): the trailing ``...`` in the signature looks like a
    # placeholder for parameters elided from this walkthrough -- it is not
    # valid Python as written; confirm against the real source file.
    def __init__(self, block_size, num_blocks, ...):
        # Stored unchanged from the constructor argument.
        self.block_size = block_size
        # Pool object that hands out and takes back blocks.
        self.block_pool = BlockPool(num_blocks)
        # Block hash -> block carrying that hash (starts empty).
        self.block_hashes: dict[BlockHash, KVCacheBlock] = {}
        # Request id -> list of blocks recorded for that request (starts empty).
        self.req_to_blocks: dict[RequestID, list[KVCacheBlock]] = {}

块分配流程

python
def allocate(self, request, num_tokens):
    """Attach the blocks `request` needs to hold `num_tokens` more tokens.

    The total block count is the ceiling of
    ``(request.num_computed_tokens + num_tokens) / self.block_size``; only the
    blocks missing from the request's current list are acquired. For each
    missing block the hash index is consulted first, and a fresh block is
    taken from the pool only on a miss.

    Args:
        request: Object exposing ``request_id`` and ``num_computed_tokens``.
        num_tokens: Number of additional tokens to make room for.

    Returns:
        The list of blocks newly attached to the request (empty when the
        request already owns enough blocks), or ``None`` when the pool runs
        out. On ``None`` every block acquired by this call has been released
        again and the request's block list is left untouched.
    """
    # 1. Work out how many blocks the request needs in total.
    total_blocks_needed = cdiv(
        request.num_computed_tokens + num_tokens,
        self.block_size,
    )
    # A request seen for the first time has no entry yet; start from an
    # empty list instead of raising KeyError. The entry is only stored once
    # the allocation has succeeded.
    current_blocks = self.req_to_blocks.get(request.request_id)
    if current_blocks is None:
        current_blocks = []
    first_new_idx = len(current_blocks)
    num_new_blocks = total_blocks_needed - first_new_idx

    # 2. Acquire each missing block, preferring a prefix-cache hit.
    new_blocks = []
    for block_idx in range(first_new_idx, first_new_idx + num_new_blocks):
        block_hash = self._compute_block_hash(request, block_idx)

        cached_block = self.block_hashes.get(block_hash)
        if cached_block is not None:
            # Reuse the indexed block and record one more owner.
            cached_block.ref_count += 1
            new_blocks.append(cached_block)
            continue

        block = self.block_pool.allocate()
        if block is None:
            # Pool exhausted: undo what this call acquired so a failed
            # allocation leaves no side effects, then report failure.
            for taken in new_blocks:
                taken.ref_count -= 1
                if taken.ref_count == 0:
                    if self.block_hashes.get(taken.hash) is taken:
                        del self.block_hashes[taken.hash]
                    self.block_pool.free(taken)
            return None

        block.hash = block_hash
        # free() only returns a block once its count drops to exactly 0, so
        # a fresh block must start with this request as its single owner.
        block.ref_count = 1
        self.block_hashes[block_hash] = block
        new_blocks.append(block)

    current_blocks.extend(new_blocks)
    self.req_to_blocks[request.request_id] = current_blocks
    return new_blocks

块释放

python
def free(self, request):
    """Release every block recorded for `request`.

    Each block's reference count is decremented; a block whose count reaches
    zero is removed from the hash index and handed back to the block pool.
    Freeing a request that has no recorded blocks (never allocated, or
    already freed) is a no-op.

    Args:
        request: Object exposing ``request_id``.
    """
    # Default to an empty list so a repeated or spurious free does not raise.
    blocks = self.req_to_blocks.pop(request.request_id, [])
    for block in blocks:
        block.ref_count -= 1
        if block.ref_count != 0:
            # Still referenced elsewhere; keep it indexed and allocated.
            continue
        # Drop the index entry only when it really points at this block:
        # the block may be unindexed, or the hash may map to another block.
        # This must happen before the pool resets the block.
        if self.block_hashes.get(block.hash) is block:
            del self.block_hashes[block.hash]
        self.block_pool.free(block)

BlockPool — vllm/v1/core/block_pool.py

python
class BlockPool:
    """Pool of physical blocks that tracks which block ids are currently free."""

    def __init__(self, num_blocks):
        # One block object per id, plus the set of ids not handed out yet.
        self.blocks = [KVCacheBlock(id=block_id) for block_id in range(num_blocks)]
        self.free_block_indices = set(range(num_blocks))

    @property
    def num_free_blocks(self):
        """Number of block ids currently in the free set."""
        free_ids = self.free_block_indices
        return len(free_ids)

    def allocate(self):
        """Take an arbitrary free block out of the pool.

        Returns:
            The block object for the removed id, or ``None`` when no id is
            free.
        """
        try:
            taken_id = self.free_block_indices.pop()
        except KeyError:
            # set.pop() raises KeyError on an empty set: nothing left.
            return None
        return self.blocks[taken_id]

    def free(self, block):
        """Reset `block` and mark its id as free again."""
        block.reset()
        self.free_block_indices.add(block.id)

KV Cache Utils — vllm/v1/core/kv_cache_utils.py

块哈希计算

python
class BlockHash:
    """Hash value of a block, used for prefix-cache matching.

    A chained hash that combines the block's content with the hash of its
    parent block.
    """

def compute_block_hash(content_hash, parent_hash, block_idx):
    """Return the built-in ``hash`` of (content_hash, parent_hash, block_idx).

    Folding the parent's hash into the tuple chains each block's hash to the
    one before it.
    """
    chain_key = (content_hash, parent_hash, block_idx)
    return hash(chain_key)

KV Cache 配置生成

python
def generate_kv_cache_config(
    model_config, cache_config, parallel_config
) -> KVCacheConfig:
    """Derive the KV-cache block size and block count from available memory.

    The bytes needed by one block are computed across all layers, and the
    available GPU memory is divided by that figure to get the block count.

    Args:
        model_config: Provides ``get_head_dim()``, ``get_num_layers()`` and
            ``get_num_kv_heads(parallel_config)``.
        cache_config: Provides ``block_size`` and ``cache_dtype``.
        parallel_config: Passed through to ``get_num_kv_heads``.

    Returns:
        A ``KVCacheConfig`` carrying ``block_size`` and ``num_blocks``.
    """
    # NOTE(review): block_size was referenced but never bound; it is taken
    # from cache_config here -- confirm the attribute name.
    block_size = cache_config.block_size

    # Per-layer geometry of the cache.
    head_dim = model_config.get_head_dim()
    num_layers = model_config.get_num_layers()
    # NOTE(review): num_kv_heads was also unbound; assumes the model config
    # exposes get_num_kv_heads(parallel_config) -- confirm the signature.
    num_kv_heads = model_config.get_num_kv_heads(parallel_config)
    dtype_size = get_dtype_size(cache_config.cache_dtype)

    # Bytes one block occupies in a single layer: keys plus values.
    bytes_per_layer = (
        block_size * head_dim * num_kv_heads * dtype_size * 2  # K + V
    )
    # A block needs that much space in every layer, so scale by the layer
    # count; otherwise num_blocks is overestimated by a factor of num_layers.
    block_size_bytes = bytes_per_layer * num_layers

    # Total memory available for the cache.
    available_memory = get_available_gpu_memory()

    # Whole blocks that fit.
    num_blocks = available_memory // block_size_bytes

    return KVCacheConfig(
        block_size=block_size,
        num_blocks=num_blocks,
    )

KV Cache Coordinator — vllm/v1/core/kv_cache_coordinator.py

协调多个缓存组(例如 sliding window + full attention):

python
class KVCacheCoordinator:
    """Fans one allocation request out to several KV-cache managers."""

    def __init__(self, managers: list[KVCacheManager]):
        self.managers = managers

    def allocate(self, request, num_tokens):
        """Ask every manager, in order, to allocate for `request`.

        Returns:
            ``True`` when every manager returned something other than
            ``None``. As soon as one manager returns ``None`` the remaining
            managers are skipped, ``self._rollback_allocations()`` is called
            (its definition is not part of this excerpt), and ``None`` is
            returned.
        """
        for mgr in self.managers:
            if mgr.allocate(request, num_tokens) is not None:
                continue
            # One failure invalidates the whole request: undo and bail out.
            self._rollback_allocations()
            return None
        return True

关键函数索引

| 函数/类 | 文件 | 职责 |
| --- | --- | --- |
| KVCacheManager.allocate() | v1/core/kv_cache_manager.py | 分配 KV 缓存块 |
| KVCacheManager.free() | v1/core/kv_cache_manager.py | 释放请求的缓存块 |
| BlockPool.allocate() | v1/core/block_pool.py | 从空闲池分配物理块 |
| generate_kv_cache_config() | v1/core/kv_cache_utils.py | 计算缓存配置参数 |
| compute_block_hash() | v1/core/kv_cache_utils.py | 计算块哈希(前缀缓存) |
| KVCacheCoordinator.allocate() | v1/core/kv_cache_coordinator.py | 协调多组缓存分配 |