From bd75f96b97c1f4e83f4d890b8d5b7c360db7c062 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Mon, 3 Feb 2025 07:35:56 -0800
Subject: [PATCH] update comment

Signed-off-by: Lucas Wilkinson
---
 vllm/envs.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index b49c40efe22d1..8751e972dc563 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -541,10 +541,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
                  ),
-    # Align single entrys (within a page) so they are 256 byte aligned for
-    # better performance, this increases the memory usage of the cache.
-    # Currenlty this primarily affects MLA that results in non-256 byte aligned
-    # entrys.
+    # When running on an NVIDIA GPU, align single entries (within a page) so
+    # they are 256-byte aligned for better performance; this increases the
+    # memory usage of the cache. This matches the alignment the CUDA runtime
+    # uses for all allocations. Currently this primarily affects MLA, whose
+    # entries are not naturally 256-byte aligned; for most other models the
+    # entries are already naturally aligned to 256 bytes.
     "VLLM_CUDA_MEM_ALIGN_KV_CACHE":
     lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
 }
 
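
Note: for context on what the updated comment describes, below is a minimal
illustrative sketch (not code from this patch) of the padding this flag
implies. The helper name `align_entry_size` and the example entry size of
576 fp16 elements for an MLA-style cache are assumptions for illustration;
only the `os.getenv` gating pattern is taken from the diff above.

    import os

    # Gating pattern as it appears in vllm/envs.py (default: enabled).
    align_enabled = bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1")))

    # Hypothetical helper: round an entry size up to a 256-byte boundary,
    # matching the alignment the CUDA runtime uses for its allocations.
    def align_entry_size(entry_bytes: int, align: int = 256) -> int:
        return -(-entry_bytes // align) * align  # ceiling division

    # Illustrative MLA-style entry: 576 fp16 elements -> 1152 bytes, which
    # is not a multiple of 256; padding costs 128 extra bytes per entry,
    # which is the memory-usage increase the comment refers to.
    raw = 576 * 2
    padded = align_entry_size(raw) if align_enabled else raw
    print(raw, padded, padded - raw)  # 1152 1280 128 (when enabled)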