Skip to content

多模态处理 — 代码走读

MultiModalRegistry — vllm/multimodal/registry.py

python
class MultiModalRegistry:
    """Registry that maps a model class name to a multimodal processor instance."""

    def __init__(self):
        # Keyed by ``model_cls.__name__``; values are processor instances.
        self._processors: dict[str, MultiModalProcessor] = dict()

    def register_model(self, model_cls, processor_cls):
        """Instantiate ``processor_cls`` and store it under the model's class name.

        A later registration for a class with the same ``__name__`` replaces
        the earlier entry.
        """
        processor = processor_cls()
        name = model_cls.__name__
        self._processors[name] = processor

    def get_processor(self, model_cls):
        """Return the processor registered for ``model_cls``, or ``None`` if absent."""
        name = model_cls.__name__
        return self._processors.get(name)

MultiModalProcessor — vllm/multimodal/processing/processor.py

python
class MultiModalProcessor:
    """Turn multimodal inputs into embeddings merged with the text prompt."""

    def process(self, inputs: MultiModalInputs):
        """Encode image inputs, project them, and merge them with the text.

        ``inputs`` is accessed like a mapping: ``.get("image" | "audio" |
        "video")`` and ``["prompt"]``.

        Returns:
            Whatever ``self._merge_text_and_features`` returns.
        """
        # 1. Pull out the per-modality payloads.
        images = inputs.get("image")
        # Audio and video are looked up but not used further in this method.
        _audio = inputs.get("audio")
        _video = inputs.get("video")

        # 2. Preprocess each image and run it through the vision encoder.
        #    A missing or empty image payload yields an empty feature list.
        encoded_images = [
            self.vision_encoder(self.image_processor(image))
            for image in (images or ())
        ]

        # 3. Project the encoded features into the language-model space.
        projected = self.multi_modal_projector(encoded_images)

        # 4. Build inputs_embeds from the text embeddings and the projection.
        # NOTE(review): `text_embeds` is not assigned anywhere in this method;
        # confirm where it is expected to come from.
        return self._merge_text_and_features(
            text_embeds, projected, inputs["prompt"]
        )

Encoder Cache Manager — vllm/v1/core/encoder_cache_manager.py

python
class EncoderCacheManager:
    """Cache of encoder outputs, stored in a fixed dtype inside an ``LRUCache``."""

    def __init__(self, cache_size, cache_dtype):
        """Create the backing ``LRUCache(cache_size)`` and remember the dtype."""
        self.cache_dtype = cache_dtype
        self.cache = LRUCache(cache_size)

    def get(self, cache_key):
        """Return whatever the backing cache yields for ``cache_key``."""
        cached = self.cache.get(cache_key)
        return cached

    def put(self, cache_key, features):
        """Convert ``features`` with ``.to(self.cache_dtype)`` and store the result."""
        converted = features.to(self.cache_dtype)
        self.cache.put(cache_key, converted)

相关输入类型 — vllm/multimodal/inputs.py

python
class MultiModalKwargs(TypedDict):
    """Typed dictionary of multimodal payloads, one key per modality."""

    # Each value is either None or a list of inputs for that modality.
    image: Optional[list[ImageInput]]
    audio: Optional[list[AudioInput]]
    video: Optional[list[VideoInput]]

class MultiModalInputs:
    """Container pairing a text prompt with its multimodal data."""

    # NOTE(review): only bare annotations are visible here — no decorator or
    # __init__ appears in this excerpt; confirm how instances are constructed.
    prompt: str  # prompt text
    token_ids: list[int]  # presumably the tokenized prompt — TODO confirm
    multi_modal_data: MultiModalKwargs  # per-modality payloads

关键函数索引

| 函数/类 | 文件 | 职责 |
| --- | --- | --- |
| `MultiModalRegistry.register_model()` | `multimodal/registry.py` | 注册模型的多模态处理器 |
| `MultiModalProcessor.process()` | `multimodal/processing/processor.py` | 处理多模态输入 |
| `EncoderCacheManager.get/put()` | `v1/core/encoder_cache_manager.py` | 编码器输出缓存 |
| `MultiModalInputs` | `multimodal/inputs.py` | 多模态输入数据结构 |
| `ImageProcessor` | `multimodal/image.py` | 图像预处理 |
| `AudioProcessor` | `multimodal/audio.py` | 音频预处理 |