Skip to content

Commit 266282d

Browse files
ywang96
authored and
Ubuntu
committed
[V1] Fix multimodal profiling for Molmo (vllm-project#11325)
Signed-off-by: ywang96 <ywang@example.com> Co-authored-by: ywang96 <ywang@example.com>
1 parent 2a36fa7 commit 266282d

File tree

4 files changed

+24
-4
lines changed

4 files changed

+24
-4
lines changed

vllm/model_executor/models/molmo.py

+5
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,11 @@ def image_input_mapper_for_molmo(
928928
data: object,
929929
):
930930
if isinstance(data, list):
931+
assert len(data) == 1, "Molmo supports only one image per prompt."
931932
data = data[0]
933+
934+
# Remove unused dummy PIL image
935+
data.pop('raw_mm_data', None)
932936
return MultiModalKwargs(data)
933937

934938

@@ -974,6 +978,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
974978
dummy_imgdata = {
975979
"images": out["images"],
976980
"image_input_idx": out["image_input_idx"],
981+
"raw_mm_data": dummy_image,
977982
}
978983
if "image_masks" in out:
979984
dummy_imgdata["image_masks"] = out["image_masks"]

vllm/v1/engine/mm_input_mapper.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -151,17 +151,31 @@ class MMHasher:
151151
def __init__(self):
152152
pass
153153

154-
def hash_mm_data(
154+
def hash_dummy_mm_data(
155155
self,
156156
mm_data: Optional[MultiModalDataDict]) -> Optional[List[str]]:
157+
"""Hash user-defined dummy multimodal data used for profiling."""
158+
157159
if mm_data is None:
158160
return None
159161

160162
image_inputs = mm_data['image']
161163

164+
# This is a temporary workaround for models (e.g., Molmo) that
165+
# process multimodal data in the input processor (therefore
166+
# image_inputs is MultiModalKwargs instead of raw input format).
167+
# `raw_mm_data` with the original input format is expected
168+
# in this case.
169+
if isinstance(image_inputs, dict):
170+
assert "raw_mm_data" in image_inputs and isinstance(
171+
image_inputs["raw_mm_data"], PIL.Image.Image)
172+
image_inputs = image_inputs.pop("raw_mm_data")
173+
162174
return self.hash_images(image_inputs)
163175

164-
def hash_prompt(self, prompt: PromptType) -> Optional[List[str]]:
176+
def hash_prompt_mm_data(self, prompt: PromptType) -> Optional[List[str]]:
177+
"""Hash multimodal data in the user input prompt if they exist."""
178+
165179
if "multi_modal_data" not in prompt:
166180
return None
167181

@@ -171,6 +185,7 @@ def hash_prompt(self, prompt: PromptType) -> Optional[List[str]]:
171185
return self.hash_images(image_inputs)
172186

173187
def hash_images(self, image_inputs) -> Optional[List[str]]:
188+
"""Hash PIL image objects to strings."""
174189
if not isinstance(image_inputs, list):
175190
image_inputs = [image_inputs]
176191
assert len(image_inputs) > 0

vllm/v1/engine/processor.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def process_inputs(
7979
# Compute MM hashes (if enabled)
8080
mm_hashes = None
8181
if self.use_hash:
82-
mm_hashes = self.mm_hasher.hash_prompt(prompt)
82+
mm_hashes = self.mm_hasher.hash_prompt_mm_data(prompt)
8383

8484
# Process inputs.
8585
preprocessed_inputs = self.input_preprocessor.preprocess(

vllm/v1/worker/gpu_model_runner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -638,7 +638,7 @@ def profile_run(self) -> None:
638638
# Compute MM hashes (if enabled)
639639
mm_hashes = None
640640
if self.use_hash:
641-
mm_hashes = self.mm_hasher.hash_mm_data(dummy_mm_data)
641+
mm_hashes = self.mm_hasher.hash_dummy_mm_data(dummy_mm_data)
642642

643643
dummy_mm_kwargs = self.mm_input_mapper_client.process_inputs(
644644
mm_data=dummy_mm_data,

0 commit comments

Comments
 (0)