diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index de3777cad0589..19effcbfc5512 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -75,19 +75,20 @@ def __init__(self,
                  vision_hidden_size: int,
                  text_hidden_size: int,
                  projector_hidden_act: str,
+                 multimodal_projector_bias: bool,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = ""):
         super().__init__()
 
         self.linear_1 = ColumnParallelLinear(vision_hidden_size,
                                              text_hidden_size,
-                                             bias=True,
+                                             bias=multimodal_projector_bias,
                                              quant_config=quant_config,
                                              prefix=f"{prefix}.linear_1")
         self.act = get_act_fn(projector_hidden_act)
         self.linear_2 = RowParallelLinear(text_hidden_size,
                                           text_hidden_size,
-                                          bias=True,
+                                          bias=multimodal_projector_bias,
                                           quant_config=quant_config,
                                           prefix=f"{prefix}.linear_2")
 
@@ -503,6 +504,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
             projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "multi_modal_projector"))
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 185edcb8de11f..defdeb54afb6a 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -231,7 +231,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=vision_hidden_size,
             text_hidden_size=config.text_config.hidden_size,
-            projector_hidden_act=config.projector_hidden_act)
+            projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias)
 
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index a5002513554db..d70ae2f148ff9 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -253,16 +253,16 @@ def forward(self, image_features: torch.Tensor):
 class LlavaNextMultiModalProjector(nn.Module):
 
     def __init__(self, vision_hidden_size: int, text_hidden_size: int,
-                 projector_hidden_act: str):
+                 projector_hidden_act: str, multimodal_projector_bias: bool):
         super().__init__()
 
         self.linear_1 = nn.Linear(vision_hidden_size,
                                   text_hidden_size,
-                                  bias=True)
+                                  bias=multimodal_projector_bias)
         self.act = get_act_fn(projector_hidden_act)
         self.linear_2 = nn.Linear(text_hidden_size,
                                   text_hidden_size,
-                                  bias=True)
+                                  bias=multimodal_projector_bias)
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
@@ -298,7 +298,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.multi_modal_projector = LlavaNextMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
-            projector_hidden_act=config.projector_hidden_act)
+            projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias)
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index ac502000c3ee7..f1c06cd85967c 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -372,11 +372,11 @@ def __init__(self, config: LlavaOnevisionConfig):
 
         self.linear_1 = nn.Linear(config.vision_config.hidden_size,
                                   config.text_config.hidden_size,
-                                  bias=True)
+                                  bias=config.multimodal_projector_bias)
         self.act = get_act_fn(config.projector_hidden_act)
         self.linear_2 = nn.Linear(config.text_config.hidden_size,
                                   config.text_config.hidden_size,
-                                  bias=True)
+                                  bias=config.multimodal_projector_bias)
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
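
Note (illustration only, not part of the diff): a minimal sketch of the pattern the changes above apply. Each LLaVA-family projector stops hardcoding bias=True and instead reads the boolean multimodal_projector_bias field from the HF config. The class and variable names below are hypothetical, and the getattr default of True is an assumption to cover older configs that predate the field; this is not vLLM's implementation.

import torch
import torch.nn as nn


class ProjectorSketch(nn.Module):
    """Two-layer MLP projector whose Linear bias is driven by a config flag."""

    def __init__(self, vision_hidden_size: int, text_hidden_size: int,
                 multimodal_projector_bias: bool):
        super().__init__()
        # Both projection layers share the same configurable bias flag.
        self.linear_1 = nn.Linear(vision_hidden_size,
                                  text_hidden_size,
                                  bias=multimodal_projector_bias)
        self.act = nn.GELU()
        self.linear_2 = nn.Linear(text_hidden_size,
                                  text_hidden_size,
                                  bias=multimodal_projector_bias)

    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        return self.linear_2(self.act(self.linear_1(image_features)))


# Hypothetical usage with a loaded HF config object (hf_config):
# bias = getattr(hf_config, "multimodal_projector_bias", True)  # assumed default
# projector = ProjectorSketch(1024, 4096, bias)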