@@ -165,7 +165,6 @@ class ModelConfig:
165
165
`logits_processors` extra completion argument. Defaults to None,
166
166
which allows no processors.
167
167
generation_config: Configuration parameter file for generation.
168
- disable_mla: Whether to disable MLA for DeepSeek models.
169
168
override_generation_config: Override the generation config with the
170
169
given config.
171
170
"""
@@ -227,7 +226,6 @@ def __init__(
227
226
override_pooler_config : Optional ["PoolerConfig" ] = None ,
228
227
logits_processor_pattern : Optional [str ] = None ,
229
228
generation_config : Optional [str ] = None ,
230
- disable_mla : bool = False ,
231
229
enable_sleep_mode : bool = False ,
232
230
override_generation_config : Optional [Dict [str , Any ]] = None ,
233
231
) -> None :
@@ -278,7 +276,6 @@ def __init__(
278
276
self .max_logprobs = max_logprobs
279
277
self .disable_sliding_window = disable_sliding_window
280
278
self .skip_tokenizer_init = skip_tokenizer_init
281
- self .disable_mla = disable_mla
282
279
self .enable_sleep_mode = enable_sleep_mode
283
280
284
281
from vllm .platforms import current_platform
@@ -748,7 +745,7 @@ def is_deepseek_mla(self) -> bool:
748
745
def get_head_size (self ) -> int :
749
746
# TODO remove hard code
750
747
if self .is_deepseek_mla :
751
- if self .should_use_mla :
748
+ if self .use_mla :
752
749
return self .hf_text_config .kv_lora_rank
753
750
else :
754
751
qk_rope_head_dim = getattr (self .hf_text_config ,
@@ -815,7 +812,7 @@ def get_total_num_kv_heads(self) -> int:
815
812
816
813
def get_num_kv_heads (self , parallel_config : "ParallelConfig" ) -> int :
817
814
"""Returns the number of KV heads per GPU."""
818
- if self .should_use_mla :
815
+ if self .use_mla :
819
816
# When using MLA during decode it becomes MQA
820
817
return 1
821
818
@@ -971,8 +968,7 @@ def is_cross_encoder(self) -> bool:
971
968
972
969
@property
973
970
def use_mla (self ) -> bool :
974
- use_mla = (self .is_deepseek_mla and not self .disable_mla
975
- and not envs .VLLM_MLA_DISABLE )
971
+ use_mla = (self .is_deepseek_mla and not envs .VLLM_MLA_DISABLE )
976
972
return use_mla
977
973
978
974
def supported_runner_types (self ) -> Set [RunnerType ]:
0 commit comments