Appearance
多模态处理 — 代码走读
MultiModalRegistry — vllm/multimodal/registry.py
python
class MultiModalRegistry:
    """Registry that maps a model class to its multimodal processor instance.

    Entries are keyed by ``model_cls.__name__``, so registering a second
    model class with the same name replaces the earlier entry.
    """

    def __init__(self):
        # Model class name -> processor instance created at registration time.
        self._processors: dict[str, MultiModalProcessor] = {}

    def register_model(self, model_cls, processor_cls):
        """Instantiate ``processor_cls`` with no arguments and store it.

        Args:
            model_cls: Model class; its ``__name__`` is used as the key.
            processor_cls: Zero-argument callable that builds the processor.
        """
        self._processors[model_cls.__name__] = processor_cls()

    def get_processor(self, model_cls):
        """Return the processor stored for ``model_cls``, or ``None`` if absent."""
        return self._processors.get(model_cls.__name__)


# MultiModalProcessor — vllm/multimodal/processing/processor.py
python
class MultiModalProcessor:
    """Illustrative processor that turns multimodal inputs into merged embeddings."""

    def process(self, inputs: MultiModalInputs):
        """Encode the image inputs, project them, and merge them with the prompt.

        NOTE(review): the nesting below was reconstructed from the numbered
        step comments because the snippet's indentation was lost; confirm
        against the original source.

        Args:
            inputs: Accessed mapping-style here (``.get(...)`` and
                ``["prompt"]``).

        Returns:
            Whatever ``self._merge_text_and_features`` returns.
        """
        # 1. Parse the multimodal data.
        image_data = inputs.get("image")
        # NOTE(review): audio and video are read here but are not consumed by
        # the rest of this snippet.
        audio_data = inputs.get("audio")
        video_data = inputs.get("video")

        # 2. Preprocess each modality (only images are handled in this snippet).
        image_features = []
        if image_data:
            for img in image_data:
                pixel_values = self.image_processor(img)
                features = self.vision_encoder(pixel_values)
                image_features.append(features)

        # 3. Project into the language-model space.
        projected = self.multi_modal_projector(image_features)

        # 4. Build inputs_embeds.
        # NOTE(review): text_embeds is not defined anywhere in this snippet;
        # it must be supplied before this code can run.
        inputs_embeds = self._merge_text_and_features(
            text_embeds, projected, inputs["prompt"]
        )
        return inputs_embeds


# Encoder Cache Manager — vllm/v1/core/encoder_cache_manager.py
python
class EncoderCacheManager:
    """Cache for encoder output features, backed by an ``LRUCache``."""

    def __init__(self, cache_size, cache_dtype):
        """Create the backing cache and remember the dtype used when storing.

        Args:
            cache_size: Passed straight to ``LRUCache``.
            cache_dtype: Target passed to ``features.to(...)`` in ``put``.
        """
        self.cache = LRUCache(cache_size)
        self.cache_dtype = cache_dtype

    def get(self, cache_key):
        """Return the backing cache's result for ``cache_key``."""
        return self.cache.get(cache_key)

    def put(self, cache_key, features):
        """Convert ``features`` with ``.to(self.cache_dtype)`` and store the result."""
        self.cache.put(cache_key, features.to(self.cache_dtype))


# Related input types — vllm/multimodal/inputs.py
python
class MultiModalKwargs(TypedDict):
    """Typed dict of per-modality inputs; each value is an optional list."""

    image: Optional[list[ImageInput]]
    audio: Optional[list[AudioInput]]
    video: Optional[list[VideoInput]]
class MultiModalInputs:
    """Declares the fields of a multimodal request: prompt, token ids, data."""

    # Annotation-only declarations: no values are assigned and no __init__
    # is defined in this snippet.
    prompt: str
    token_ids: list[int]
    multi_modal_data: MultiModalKwargs


# Key function index
| 函数/类 | 文件 | 职责 |
|---|---|---|
| MultiModalRegistry.register_model() | multimodal/registry.py | 注册模型的多模态处理器 |
| MultiModalProcessor.process() | multimodal/processing/processor.py | 处理多模态输入 |
| EncoderCacheManager.get/put() | v1/core/encoder_cache_manager.py | 编码器输出缓存 |
| MultiModalInputs | multimodal/inputs.py | 多模态输入数据结构 |
| ImageProcessor | multimodal/image.py | 图像预处理 |
| AudioProcessor | multimodal/audio.py | 音频预处理 |