From 18e35ddda5b83658e5fc5366599ea59e2415f213 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 3 Feb 2025 16:29:56 -0500 Subject: [PATCH] Squelch MLA warning for Compressed-Tensors Models (#12704) Signed-off-by: Kyle Sayers Signed-off-by: saeediy --- vllm/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index d70a637956edf..2f4a7ad769d98 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -986,6 +986,9 @@ def is_cross_encoder(self) -> bool: @property def use_mla(self) -> bool: + if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE: + return False + if self.quantization is not None and self.quantization not in [\ "fp8", "compressed-tensors"]: logger.warning( @@ -1012,8 +1015,7 @@ def use_mla(self) -> bool: quant_config) return False - use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE) - return use_mla + return True @property def supported_runner_types(self) -> Set[RunnerType]: