Commit 25a9b5c (parent: 73fe175)
File tree: 1 file changed, +4 -8 lines
vllm/v1/attention/backends

@@ -82,8 +82,10 @@ def __init__(
         if alibi_slopes is not None:
             alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
         self.alibi_slopes = alibi_slopes
-        self.sliding_window = ((sliding_window, sliding_window)
-                               if sliding_window is not None else (-1, -1))
+        if sliding_window is None:
+            self.sliding_window = (-1, -1)
+        else:
+            self.sliding_window = (sliding_window - 1, 0)
         self.kv_cache_dtype = kv_cache_dtype
         if logits_soft_cap is None:
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
@@ -93,12 +95,6 @@ def __init__(
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads

-        if sliding_window is not None:
-            # NOTE(woosuk): flash-attn's sliding window does not work with
-            # paged KV cache.
-            raise ValueError(
-                "Sliding window is not supported in FlashAttention.")
-
         support_head_sizes = FlashAttentionBackend.get_supported_head_sizes()
         if head_size not in support_head_sizes:
             raise ValueError(
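
The change swaps the old (sliding_window, sliding_window) pair for flash-attn's (left, right) window convention and drops the check that rejected sliding window in this backend. In flash-attn, window_size=(-1, -1) means no windowing, while window_size=(left, right) lets a query at position i attend to keys in [i - left, i + right]. The sketch below is not vLLM or flash-attn code; it is a minimal PyTorch mask builder (the window_mask helper and its shapes are illustrative assumptions) showing why (sliding_window - 1, 0) yields a causal window of exactly sliding_window tokens, current token included.

import torch

def window_mask(seq_len: int, left: int, right: int) -> torch.Tensor:
    # Illustrative helper (not part of vLLM or flash-attn): entry [i, j] is
    # True when query i may attend to key j, mirroring flash-attn's
    # window_size=(left, right) convention, where -1 means unbounded.
    q = torch.arange(seq_len).unsqueeze(1)  # query positions i, shape (S, 1)
    k = torch.arange(seq_len).unsqueeze(0)  # key positions j,   shape (1, S)
    lo = torch.full_like(q, -10**9) if left < 0 else q - left
    hi = torch.full_like(q, 10**9) if right < 0 else q + right
    return (k >= lo) & (k <= hi)

# window_size = (sliding_window - 1, 0): each query attends to itself plus
# the previous sliding_window - 1 tokens, i.e. sliding_window tokens total.
sliding_window = 4
mask = window_mask(seq_len=8, left=sliding_window - 1, right=0)
assert mask[7].sum().item() == sliding_window  # keys 4..7 for query 7

# window_size = (-1, -1): no window restriction (causal masking, if any,
# is applied separately by the attention kernel).
assert window_mask(seq_len=8, left=-1, right=-1).all()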