```diff
@@ -77,6 +77,7 @@
     V_SCALE_CONSTANT: int = 100
     VLLM_SERVER_DEV_MODE: bool = False
     VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
+    VLLM_MLA_DISABLE: bool = False
     VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
 
 
@@ -302,10 +303,6 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     "VLLM_FLASHINFER_FORCE_TENSOR_CORES":
     lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))),
 
-    # If set, vLLM will disable the MLA attention optimizations.
-    "VLLM_DISABLE_MLA":
-    lambda: bool(int(os.getenv("VLLM_DISABLE_MLA", "0"))),
-
     # Pipeline stage partition strategy
     "VLLM_PP_LAYER_PARTITION":
     lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
@@ -512,6 +509,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
 
+    # If set, vLLM will disable the MLA attention optimizations.
+    "VLLM_MLA_DISABLE":
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
+
     # Flag that can control whether or not we perform matrix-absorption for MLA
     # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
     # matrices reduces the runtime FLOPs needed to compute MLA but requires
```
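The table entries above use the lazy-lookup convention of `envs.py`: each variable name maps to a zero-argument lambda, so `os.getenv` runs when the value is accessed rather than when the module is imported, and the `"0"`/`"1"` string is converted to a bool via `int()`. A minimal self-contained sketch of that convention, assuming a plain dict and an illustrative `get_env` helper (vLLM's real accessor is not shown in this diff):

```python
import os
from typing import Any, Callable, Dict

# Sketch of the lazy env-var table pattern from the diff above.
# Each entry is a zero-argument lambda, so os.getenv runs at access
# time rather than import time; "0"/"1" strings become bools via int().
environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_MLA_DISABLE":
    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
}

def get_env(name: str) -> Any:
    """Illustrative accessor: resolve a variable by calling its lambda."""
    return environment_variables[name]()

os.environ["VLLM_MLA_DISABLE"] = "1"
assert get_env("VLLM_MLA_DISABLE") is True  # parsed lazily, at call time
```

Note that the rename is a hard cut: the hunk at line 303 removes `VLLM_DISABLE_MLA` and the hunk at line 509 adds `VLLM_MLA_DISABLE`, with no alias in between, so only the new spelling has any effect after this commit.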
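As for the `VLLM_MLA_PERFORM_MATRIX_ABSORPTION` comment that closes the diff: the FLOP saving comes from matmul associativity, `(x @ W_Q) @ W_UK == x @ (W_Q @ W_UK)`, which lets two projection matrices be fused ("absorbed") into one weight ahead of time. A hedged numpy sketch of the idea, with illustrative dimensions rather than vLLM's actual shapes or kernels:

```python
import numpy as np

# Sketch of "matrix absorption": (x @ W_Q) @ W_UK == x @ (W_Q @ W_UK),
# so the product W_Q @ W_UK can be folded into a single weight once,
# at load time, removing one matmul from every decode step.
rng = np.random.default_rng(0)
d_model, d_head, d_latent = 64, 32, 16    # illustrative sizes only

x = rng.standard_normal((1, d_model))     # one decode-step activation
W_Q = rng.standard_normal((d_model, d_head))
W_UK = rng.standard_normal((d_head, d_latent))

two_step = (x @ W_Q) @ W_UK               # unabsorbed: two matmuls per token
W_Q_UK = W_Q @ W_UK                       # absorbed: computed once up front
one_step = x @ W_Q_UK                     # one matmul per token thereafter

assert np.allclose(two_step, one_step)
```

Precomputing the fused weight is the runtime-FLOP reduction the comment describes; the flag exists so the trade-off can be toggled per deployment.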