From bd75f96b97c1f4e83f4d890b8d5b7c360db7c062 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Mon, 3 Feb 2025 07:35:56 -0800
Subject: [PATCH] update comment

Signed-off-by: Lucas Wilkinson
---
 vllm/envs.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index b49c40efe22d1..8751e972dc563 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -541,10 +541,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
                  ),
-    # Align single entrys (within a page) so they are 256 byte aligned for
-    # better performance, this increases the memory usage of the cache.
-    # Currenlty this primarily affects MLA that results in non-256 byte aligned
-    # entrys.
+    # When running on an NVIDIA GPU, align single entries (within a page) so
+    # they are 256-byte aligned for better performance; this increases the
+    # memory usage of the cache. This matches the alignment the CUDA runtime
+    # uses for all allocations. Currently this primarily affects MLA, whose
+    # entries are not naturally 256-byte aligned; for most other models the
+    # entries are already naturally aligned to 256 bytes.
     "VLLM_CUDA_MEM_ALIGN_KV_CACHE":
     lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
 }
 
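
Note: for context on what the updated comment describes, below is a minimal
illustrative sketch (not code from this patch) of the padding this flag
implies. The helper name `align_entry_size` and the example entry size of
576 fp16 elements for an MLA-style cache are assumptions for illustration;
only the `os.getenv` gating pattern is taken from the diff above.

    import os

    # Gating pattern as it appears in vllm/envs.py (default: enabled).
    align_enabled = bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1")))

    # Hypothetical helper: round an entry size up to a 256-byte boundary,
    # matching the alignment the CUDA runtime uses for its allocations.
    def align_entry_size(entry_bytes: int, align: int = 256) -> int:
        return -(-entry_bytes // align) * align  # ceiling division

    # Illustrative MLA-style entry: 576 fp16 elements -> 1152 bytes, which
    # is not a multiple of 256; padding costs 128 extra bytes per entry,
    # which is the memory-usage increase the comment refers to.
    raw = 576 * 2
    padded = align_entry_size(raw) if align_enabled else raw
    print(raw, padded, padded - raw)  # 1152 1280 128 (when enabled)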