bugfix: Fix cudagraph mode of BatchPrefillWithRaggedKVCacheWrapper (#412)

The computation of `fixed_batch_size` is not correct.
yzh119 authored Jul 30, 2024
1 parent 58d3593 commit 9907bc1
Showing 1 changed file with 2 additions and 2 deletions.
python/flashinfer/prefill.py (2 additions, 2 deletions)

@@ -1215,8 +1215,8 @@ def __init__(
                 raise ValueError(
                     "kv_indptr_buf should be a torch.Tensor in cuda graph mode"
                 )
-            self._fixed_batch_size = len(qo_indptr_buf)
-            if len(kv_indptr_buf) != self._fixed_batch_size:
+            self._fixed_batch_size = len(qo_indptr_buf) - 1
+            if len(kv_indptr_buf) != self._fixed_batch_size + 1:
                 raise ValueError(
                     "The length of kv_indptr_buf ({}) should be the same as qo_indptr_buf ({}).".format(
                         len(kv_indptr_buf), self._fixed_batch_size
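For context on the fix: `qo_indptr_buf` and `kv_indptr_buf` follow the CSR-style indptr convention, where a batch of `batch_size` requests is described by `batch_size + 1` offsets. The sketch below (standalone Python with hypothetical request lengths, not code from this commit) illustrates why the batch size must be recovered as `len(qo_indptr_buf) - 1` and why the consistency check compares against `fixed_batch_size + 1`.

# Minimal sketch of the indptr convention behind the fix (hypothetical data,
# not from the commit): an indptr buffer holds batch_size + 1 prefix-sum
# offsets, so the batch size implied by a buffer is len(buf) - 1.
import torch

batch_size = 4
qo_lens = torch.tensor([7, 3, 5, 1])        # hypothetical query lengths
kv_lens = torch.tensor([128, 64, 256, 32])  # hypothetical KV lengths

# Exclusive prefix sums => batch_size + 1 entries each.
qo_indptr = torch.cat([torch.zeros(1, dtype=torch.int32),
                       torch.cumsum(qo_lens, 0, dtype=torch.int32)])
kv_indptr = torch.cat([torch.zeros(1, dtype=torch.int32),
                       torch.cumsum(kv_lens, 0, dtype=torch.int32)])

assert len(qo_indptr) == batch_size + 1        # 5 entries, not 4
fixed_batch_size = len(qo_indptr) - 1          # the corrected computation
assert len(kv_indptr) == fixed_batch_size + 1  # the corrected consistency check

With the pre-fix code, `fixed_batch_size` would be off by one (5 instead of 4 in this example), and the length check would reject correctly sized `kv_indptr_buf` tensors in cuda graph mode.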
