Appearance
# KV缓存与PagedAttention — 代码走读
## KVCacheManager — vllm/v1/core/kv_cache_manager.py
### 初始化
python
class KVCacheManager:
    """Track the KV-cache blocks held by each request and the prefix-cache index."""

    def __init__(self, block_size, num_blocks, **kwargs):
        """Create the manager together with its backing block pool.

        Args:
            block_size: Number of tokens stored per block. Must be positive,
                since block counts are obtained by dividing token counts by it.
            num_blocks: Number of blocks handed to ``BlockPool``.
            **kwargs: Stands in for the constructor options this walkthrough
                elides (originally written as ``...``, which does not parse);
                they are accepted and ignored here.

        Raises:
            ValueError: If ``block_size`` is not positive.
        """
        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")
        self.block_size = block_size
        self.block_pool = BlockPool(num_blocks)
        # Prefix-cache index: block hash -> the block registered under it.
        self.block_hashes: dict[BlockHash, KVCacheBlock] = {}
        # Blocks currently held by each request, keyed by request id.
        self.req_to_blocks: dict[RequestID, list[KVCacheBlock]] = {}


### 块分配流程
python
def allocate(self, request, num_tokens):
    """Give ``request`` enough blocks to hold ``num_tokens`` more tokens.

    Blocks whose hash is already present in the prefix-cache index are
    shared; the remaining ones are taken from the free pool.

    Args:
        request: Object exposing ``request_id`` and ``num_computed_tokens``.
        num_tokens: Number of additional tokens to make room for.

    Returns:
        The list of blocks added to the request by this call (possibly
        empty), or ``None`` when the pool runs out of blocks. On failure
        every block acquired by this call is released again.
    """
    # 1. Work out how many blocks the request needs in total.
    total_blocks_needed = cdiv(
        request.num_computed_tokens + num_tokens,
        self.block_size,
    )
    # A request seen for the first time starts with no blocks. Plain
    # indexing would raise KeyError because nothing else creates the entry.
    current_blocks = self.req_to_blocks.setdefault(request.request_id, [])
    num_new_blocks = total_blocks_needed - len(current_blocks)

    # 2. Prefix-cache lookup, falling back to the free pool.
    new_blocks = []
    for offset in range(num_new_blocks):
        block_idx = len(current_blocks) + offset
        block_hash = self._compute_block_hash(request, block_idx)
        block = self.block_hashes.get(block_hash)
        if block is None:
            block = self.block_pool.allocate()
            if block is None:
                # Pool exhausted: undo what this call acquired and report
                # failure with None instead of crashing on `None.hash`.
                for taken in new_blocks:
                    taken.ref_count -= 1
                    if taken.ref_count == 0:
                        if self.block_hashes.get(taken.hash) is taken:
                            del self.block_hashes[taken.hash]
                        self.block_pool.free(taken)
                return None
            block.hash = block_hash
            self.block_hashes[block_hash] = block
        # Every holder counts as one reference, including the first one;
        # free() decrements once per holder and recycles the block at zero.
        # NOTE(review): assumes pool blocks are handed out with
        # ref_count == 0 - confirm against KVCacheBlock / reset().
        block.ref_count += 1
        new_blocks.append(block)

    current_blocks.extend(new_blocks)
    return new_blocks


### 块释放
python
def free(self, request):
    """Drop ``request``'s hold on its blocks and recycle unreferenced ones.

    Args:
        request: Object exposing ``request_id``. Releasing a request that
            has no recorded blocks is a no-op.
    """
    # Default to an empty list so releasing an unknown or already-released
    # request does not raise.
    blocks = self.req_to_blocks.pop(request.request_id, [])
    for block in blocks:
        block.ref_count -= 1
        if block.ref_count != 0:
            continue  # still referenced elsewhere
        # Reference count hit zero: remove the prefix-cache entry, but only
        # if it really points at this block. A block that was never
        # registered must not abort the loop and strand the blocks after it.
        if self.block_hashes.get(block.hash) is block:
            del self.block_hashes[block.hash]
        self.block_pool.free(block)


## BlockPool — vllm/v1/core/block_pool.py
python
class BlockPool:
    """Pool of physical blocks that tracks which ones are currently unassigned."""

    def __init__(self, num_blocks):
        """Create ``num_blocks`` blocks, all of them initially free."""
        block_ids = range(num_blocks)
        self.blocks = [KVCacheBlock(id=block_id) for block_id in block_ids]
        self.free_block_indices = set(block_ids)

    @property
    def num_free_blocks(self):
        """Number of blocks currently available for allocation."""
        return len(self.free_block_indices)

    def allocate(self):
        """Take one free block out of the pool.

        Returns:
            A block from ``self.blocks``, or ``None`` when no block is free.
        """
        try:
            # set.pop() raises KeyError when the set is empty.
            block_id = self.free_block_indices.pop()
        except KeyError:
            return None
        return self.blocks[block_id]

    def free(self, block):
        """Reset ``block`` and mark its id as free again."""
        block.reset()
        self.free_block_indices.add(block.id)


## KV Cache Utils — vllm/v1/core/kv_cache_utils.py
### 块哈希计算
python
class BlockHash:
    """Hash value of a block, used to match blocks for prefix caching.

    The hash is chained: it combines the block's content with the hash of
    the parent block.
    """
def compute_block_hash(content_hash, parent_hash, block_idx):
    """Fold a block's content hash, its parent's hash and its index into one hash.

    Returns:
        The built-in ``hash()`` of the three values taken as a tuple.
    """
    chained_key = (content_hash, parent_hash, block_idx)
    return hash(chained_key)


### KV Cache 配置生成
python
def generate_kv_cache_config(
    model_config, cache_config, parallel_config
) -> KVCacheConfig:
    """Derive the KV-cache block size and block count from available GPU memory.

    Args:
        model_config: Supplies head dimension, layer count and KV-head count.
        cache_config: Supplies ``cache_dtype`` and the block size in tokens.
        parallel_config: Passed through when querying the KV-head count.

    Returns:
        A ``KVCacheConfig`` carrying ``block_size`` and ``num_blocks``.

    Raises:
        ValueError: If the computed per-block byte size is not positive.
    """
    # NOTE(review): block_size and num_kv_heads were used without being
    # defined. The two accessors below follow vLLM's CacheConfig / ModelConfig
    # naming - confirm against the config classes actually in use.
    block_size = cache_config.block_size
    num_kv_heads = model_config.get_num_kv_heads(parallel_config)

    head_dim = model_config.get_head_dim()
    num_layers = model_config.get_num_layers()
    dtype_size = get_dtype_size(cache_config.cache_dtype)

    # Bytes one block occupies in a single layer; the factor 2 is K plus V.
    bytes_per_layer = block_size * head_dim * num_kv_heads * dtype_size * 2
    # NOTE(review): num_layers was fetched but never used, so the block count
    # was derived from a single layer's footprint. Presumably a block needs
    # storage in every layer - confirm.
    block_size_bytes = bytes_per_layer * num_layers
    if block_size_bytes <= 0:
        raise ValueError(
            f"per-block KV cache size must be positive, got {block_size_bytes}"
        )

    # Total memory available for the cache, then the number of whole blocks
    # that fit into it.
    available_memory = get_available_gpu_memory()
    num_blocks = available_memory // block_size_bytes

    return KVCacheConfig(
        block_size=block_size,
        num_blocks=num_blocks,
    )


## KV Cache Coordinator — vllm/v1/core/kv_cache_coordinator.py
协调多个缓存组(例如 sliding window + full attention):
python
class KVCacheCoordinator:
    """Fan a single allocation request out to several KV-cache managers."""

    def __init__(self, managers: list[KVCacheManager]):
        """Store the managers that ``allocate`` calls, in order."""
        self.managers = managers

    def allocate(self, request, num_tokens):
        """Ask every manager in turn to allocate for ``request``.

        Returns:
            ``True`` when every manager succeeds; ``None`` as soon as one
            manager returns ``None``, after ``_rollback_allocations`` has run.
        """
        for mgr in self.managers:
            if mgr.allocate(request, num_tokens) is not None:
                continue
            # A single failed manager aborts the whole allocation.
            # NOTE(review): _rollback_allocations is not defined in this
            # snippet - presumably provided elsewhere; confirm.
            self._rollback_allocations()
            return None
        return True


## 关键函数索引
| 函数/类 | 文件 | 职责 |
|---|---|---|
| `KVCacheManager.allocate()` | `v1/core/kv_cache_manager.py` | 分配 KV 缓存块 |
| `KVCacheManager.free()` | `v1/core/kv_cache_manager.py` | 释放请求的缓存块 |
| `BlockPool.allocate()` | `v1/core/block_pool.py` | 从空闲池分配物理块 |
| `generate_kv_cache_config()` | `v1/core/kv_cache_utils.py` | 计算缓存配置参数 |
| `compute_block_hash()` | `v1/core/kv_cache_utils.py` | 计算块哈希(前缀缓存) |
| `KVCacheCoordinator.allocate()` | `v1/core/kv_cache_coordinator.py` | 协调多组缓存分配 |