@@ -48,7 +48,7 @@
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (is_pp_missing_parameter,
+from .utils import (extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
@@ -171,12 +171,26 @@ def __init__(
             rope_scaling=self.rope_scaling,
             is_neox_style=False,
         )
+
+        sliding_window = getattr(config, "sliding_window", None)
+        # Model v2 has sliding windows, v1 does not
+        self.v1 = sliding_window is None
+
+        layer_idx = extract_layer_index(prefix)
+        layer_has_sliding_window = (
+            getattr(config, "sliding_window_pattern", False)
+            and (layer_idx + 1) % self.config.sliding_window_pattern != 0)
+
+        self.sliding_window = (sliding_window
+                               if layer_has_sliding_window else None)
+
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               self.scaling,
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
+                              per_layer_sliding_window=self.sliding_window,
                               prefix=f"{prefix}.attn")
         if self.use_qk_norm:
             self.q_norm = LayerNorm(param_shape=(self.num_heads,
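The block added above gives each decoder layer either a local (sliding-window) or a global attention span: with a sliding_window_pattern of N, every N-th layer ((layer_idx + 1) % N == 0) attends globally, and all other layers use the window, which is then handed to Attention as per_layer_sliding_window. A minimal standalone sketch of that mapping, assuming a hypothetical config with sliding_window = 4096 and sliding_window_pattern = 4 (values chosen only for illustration):

# Illustrative sketch only; the config values below are assumptions,
# not taken from the diff or any real model config.
sliding_window = 4096
sliding_window_pattern = 4

for layer_idx in range(8):
    layer_has_sliding_window = (layer_idx + 1) % sliding_window_pattern != 0
    per_layer_window = sliding_window if layer_has_sliding_window else None
    print(layer_idx, per_layer_window)

# Layers 0-2 and 4-6 get the 4096-token window; layers 3 and 7 get None
# and therefore attend globally.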
@@ -206,7 +220,8 @@ def forward(
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         if self.use_qk_norm:
             q, k = self._apply_qk_norm(q, k)
-        q, k = self.rotary_emb(positions, q, k)
+        if self.v1 or self.sliding_window:
+            q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
         output, _ = self.o_proj(attn_output)
         return output
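The forward path mirrors that split: rotary position embeddings are applied for v1 models (no sliding window configured) and for v2 layers that use a sliding window, while v2 global-attention layers skip RoPE. A small consolidated sketch of that decision, using a hypothetical uses_rope helper that is not part of the diff:

# Hypothetical helper, for illustration only: combines the v1/v2 detection
# from __init__ with the per-layer RoPE condition used in forward().
def uses_rope(config, layer_idx: int) -> bool:
    sliding_window = getattr(config, "sliding_window", None)
    is_v1 = sliding_window is None  # v1 models apply RoPE on every layer
    pattern = getattr(config, "sliding_window_pattern", 0)
    is_local_layer = bool(pattern) and (layer_idx + 1) % pattern != 0
    # v2 models apply RoPE only on local (sliding-window) layers; global
    # layers skip it.
    return is_v1 or (sliding_window is not None and is_local_layer)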