From 66767ecb68b893cb6eb67215529b744ac9a50c44 Mon Sep 17 00:00:00 2001 From: nayohan Date: Wed, 28 Aug 2024 10:39:56 +0900 Subject: [PATCH] [MOD] check ruff format --- vllm/model_executor/models/exaone.py | 253 +++++++++++++--------- vllm/transformers_utils/configs/exaone.py | 105 +++++---- 2 files changed, 217 insertions(+), 141 deletions(-) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 098377daada01..f4100f082b05e 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -22,6 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Exaone model compatible with HuggingFace weights.""" + from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch @@ -29,35 +30,51 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import ( + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, +) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) + get_compressed_tensors_cache_scale, +) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) + DEFAULT_VOCAB_PADDING_SIZE, + ParallelLMHead, + VocabParallelEmbedding, +) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, + kv_cache_scales_loader, + maybe_remap_kv_scale_name, +) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.transformers_utils.configs.exaone import ExaoneConfig from vllm.utils import is_hip from vllm.model_executor.models.interfaces import SupportsLoRA -from vllm.model_executor.models.utils import PPMissingLayer, is_pp_missing_parameter, make_layers +from vllm.model_executor.models.utils import ( + PPMissingLayer, + is_pp_missing_parameter, + make_layers, +) class ExaoneGatedMLP(nn.Module): - def __init__( self, hidden_size: int, @@ -73,15 +90,20 @@ def __init__( output_sizes=[intermediate_size] * 2, bias=bias, quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.c_proj = RowParallelLinear(input_size=intermediate_size, - output_size=hidden_size, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.c_proj") + prefix=f"{prefix}.gate_up_proj", + ) + self.c_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.c_proj", + ) if 
hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") + raise ValueError( + f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now." + ) self.act_fn = SiluAndMul() def forward(self, x): @@ -92,7 +114,6 @@ def forward(self, x): class ExaoneAttention(nn.Module): - def __init__( self, config: ExaoneConfig, @@ -124,8 +145,9 @@ def __init__( assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr( + config, "head_dim", self.hidden_size // self.total_num_heads + ) self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -162,12 +184,14 @@ def __init__( rope_scaling=rope_scaling, is_neox_style=is_neox_style, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + ) def forward( self, @@ -185,7 +209,6 @@ def forward( class ExaoneBlockAttention(nn.Module): - def __init__( self, config: ExaoneConfig, @@ -222,17 +245,15 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: - return self.attention( positions=positions, hidden_states=hidden_states, kv_cache=kv_cache, - attn_metadata=attn_metadata + attn_metadata=attn_metadata, ) class ExaoneDecoderLayer(nn.Module): - def __init__( self, config: ExaoneConfig, @@ -245,21 +266,26 @@ def __init__( rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None): + config, "original_max_position_embeddings", None + ): rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) + config.original_max_position_embeddings + ) + max_position_embeddings = getattr( + config, "max_position_embeddings", 8192 + ) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias attention_bias = getattr(config, "attention_bias", False) or getattr( - config, "bias", False) + config, "bias", False + ) self.attn = ExaoneBlockAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, - num_kv_heads=getattr(config, "num_key_value_heads", - config.num_attention_heads), + num_kv_heads=getattr( + config, "num_key_value_heads", config.num_attention_heads + ), rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, @@ -276,10 +302,8 @@ def __init__( bias=getattr(config, "mlp_bias", False), prefix=f"{prefix}.mlp", ) - self.ln_1 = RMSNorm(config.hidden_size, - eps=config.layer_norm_epsilon) - self.ln_2 = RMSNorm(config.hidden_size, - eps=config.layer_norm_epsilon) + self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) def forward( self, @@ -294,8 +318,7 @@ def forward( residual = hidden_states hidden_states = self.ln_1(hidden_states) else: 
- hidden_states, residual = self.ln_1( - hidden_states, residual) + hidden_states, residual = self.ln_1(hidden_states, residual) hidden_states = self.attn( positions=positions, hidden_states=hidden_states, @@ -304,14 +327,12 @@ def forward( ) # Fully Connected - hidden_states, residual = self.ln_2( - hidden_states, residual) + hidden_states, residual = self.ln_2(hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual class ExaoneModel(nn.Module): - def __init__( self, config: ExaoneConfig, @@ -323,12 +344,16 @@ def __init__( super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 + lora_vocab = ( + (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) + if lora_config + else 0 + ) self.vocab_size = config.vocab_size + lora_vocab self.wte = config.vocab_size - if get_pp_group().is_first_rank or (config.tie_word_embeddings - and get_pp_group().is_last_rank): + if get_pp_group().is_first_rank or ( + config.tie_word_embeddings and get_pp_group().is_last_rank + ): self.wte = VocabParallelEmbedding( self.vocab_size, config.hidden_size, @@ -339,13 +364,18 @@ def __init__( self.wte = PPMissingLayer() self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: ExaoneDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), - prefix=f"{prefix}.h") + lambda prefix: ExaoneDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.h", + ) if get_pp_group().is_last_rank: - self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.ln_f = RMSNorm( + config.hidden_size, eps=config.layer_norm_epsilon + ) else: self.ln_f = PPMissingLayer() @@ -383,10 +413,9 @@ def forward( ) if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) hidden_states, _ = self.ln_f(hidden_states, residual) return hidden_states @@ -407,8 +436,12 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA): # LoRA specific attributes supported_lora_modules = [ - "qkv_proj", "out_proj", "gate_up_proj", "c_proj", "wte", - "lm_head" + "qkv_proj", + "out_proj", + "gate_up_proj", + "c_proj", + "wte", + "lm_head", ] embedding_modules = { "wte": "input_embeddings", @@ -436,11 +469,13 @@ def __init__( self.config = config self.lora_config = lora_config - self.transformer = ExaoneModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.transformer = ExaoneModel( + config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model", + ) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -452,16 +487,17 @@ def __init__( padding_size=DEFAULT_VOCAB_PADDING_SIZE # We need bigger padding if using lora for kernel # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, + if not lora_config + else lora_config.lora_vocab_padding_size, quant_config=quant_config, ) if config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, - logit_scale) + self.logits_processor = 
LogitsProcessor( + self.unpadded_vocab_size, config.vocab_size, logit_scale + ) self.sampler = Sampler() else: self.lm_head = PPMissingLayer() @@ -474,8 +510,9 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - model_output = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + model_output = self.transformer( + input_ids, positions, kv_caches, attn_metadata, intermediate_tensors + ) return model_output def compute_logits( @@ -483,8 +520,9 @@ def compute_logits( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor( + self.lm_head, hidden_states, sampling_metadata + ) return logits def sample( @@ -496,18 +534,22 @@ def sample( return next_tokens def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) + self, batch_size: int, dtype: torch.dtype, device: torch.device + ) -> IntermediateTensors: + return IntermediateTensors( + { + "hidden_states": torch.zeros( + (batch_size, self.config.hidden_size), + dtype=dtype, + device=device, + ), + "residual": torch.zeros( + (batch_size, self.config.hidden_size), + dtype=dtype, + device=device, + ), + } + ) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ @@ -522,8 +564,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): + if ( + "rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name + ): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
continue @@ -535,12 +579,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if scale_name := get_compressed_tensors_cache_scale(name): # Loading kv cache scales for compressed-tensors quantization param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) @@ -569,8 +614,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) weight_loader(param, loaded_weight) # If this function is called, it should always initialize KV cache scale @@ -580,9 +626,12 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, tp_rank, tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type): + quantization_param_path, + tp_rank, + tp_size, + self.config.num_hidden_layers, + self.config.__class__.model_type, + ): if not isinstance(self.transformer.h[layer_idx], nn.Identity): layer_self_attn = self.transformer.h[layer_idx].attn @@ -595,5 +644,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: if hasattr(layer_self_attn, "kv_scale"): layer_self_attn.attn._kv_scale = scaling_factor else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") \ No newline at end of file + raise RuntimeError( + "Self attention has no KV cache scaling " + "factor attribute!" + ) diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 91433cddc95ed..13baf4f18c5ed 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -14,74 +14,92 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Exaone model configuration """ +"""Exaone model configuration""" +from typing import Dict from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) -EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP = { -} +EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {} class ExaoneConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.ExaoneModel`. It is used to - instantiate a GPT Lingvo model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the Exaone - - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + This is the configuration class to store the configuration of a :class: + `~transformers.ExaoneModel`. 
It is used to instantiate a GPT Lingvo model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the Exaone + Configuration objects inherit from :class:`~transformers.PretrainedConfig` + and can be used to control the model outputs. Read the documentation from : + class:`~transformers.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, `optional`, defaults to 50257): - Vocabulary size of the GPT Lingvo model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.ExaoneModel`. Vocabulary size of the model. - Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of - :class:`~transformers.EXAONEModel`. + Vocabulary size of the GPT Lingvo model. Defines the number of + different tokens that can be represented by the :obj:`inputs_ids` + passed when calling :class:`~transformers.ExaoneModel`. Vocabulary + size of the model. + Defines the different tokens that can be represented by the + `inputs_ids` passed to the forward method of :class: + `~transformers.EXAONEModel`. hidden_size (:obj:`int`, `optional`, defaults to 2048): Dimensionality of the encoder layers and the pooler layer. num_layers (:obj:`int`, `optional`, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. + Number of attention heads for each attention layer in the + Transformer decoder. num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. + This is the number of key_value heads that should be used to + implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi + Head Attention (MHA), if `num_key_value_heads=1 the model will use + Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, + each group key and value head should be constructed by meanpooling + all the original heads within that group. For more details checkout + [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not + specified, will default to `num_attention_heads`. rotary_pct (`float`, *optional*, defaults to 0.25): percentage of hidden dimensions to allocate to rotary embeddings intermediate_size (:obj:`int`, `optional`, defaults to 8192): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. 
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in + the Transformer encoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, + defaults to :obj:`"gelu_new"`): + The non-linear activation function (function or string) in the + encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, + :obj:`"selu"` and :obj:`"gelu_new"` are supported. embed_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + The dropout probabilitiy for all fully connected layers in the + embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, `optional`, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.EXAONEModel`. + The vocabulary size of the :obj:`token_type_ids` passed when calling + :class:`~transformers.EXAONEModel`. initializer_range (:obj:`float`, `optional`, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): The epsilon used by the layer normalization layers. use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if ``config.is_decoder=True``. - gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): - If True, use gradient checkpointing to save memory at the expense of slower backward pass. - + Whether or not the model should return the last key/values + attentions (not used by all models). + Only relevant if ``config.is_decoder=True``. + gradient_checkpointing (:obj:`bool`, `optional`, + defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense + of slower backward pass. 
Example:: >>> from transformers import ExoneModel, ExaoneConfig @@ -95,6 +113,7 @@ class ExaoneConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config """ + model_type = "exaone" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_hidden_layers": "num_layers"} @@ -119,9 +138,14 @@ def __init__( bos_token_id=0, eos_token_id=2, tie_word_embeddings=True, - **kwargs + **kwargs, ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -135,7 +159,7 @@ def __init__( if intermediate_size: self.intermediate_size = intermediate_size else: - self.intermediate_size = hidden_size * 4 + self.intermediate_size = hidden_size * 4 self.activation_function = activation_function self.resid_dropout = resid_dropout self.embed_dropout = embed_dropout @@ -148,7 +172,6 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - #[HYUNJIK] additionl configuation self.use_logit_cap = kwargs.pop("use_logit_cap", False) self.ln_no_scale = kwargs.pop("ln_no_scale", False) self.use_gated = kwargs.pop("use_gated", False) @@ -161,6 +184,8 @@ def __init__( self.rotary_expand_length = kwargs.pop("rotary_expand_length", None) self.rotary_base = kwargs.pop("rotary_base", 10000.0) self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False) - self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head", (rotary_pct == 0.25)) + self.rescale_before_lm_head = kwargs.pop( + "rescale_before_lm_head", (rotary_pct == 0.25) + ) if self.use_rotary_pos: self.use_absolute_pos = False
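Reviewer note: the config file above derives a few values instead of storing them directly. A minimal sketch of how those defaults behave, assuming the constructor defaults documented in the docstring (hidden_size 2048, rotary_pct 0.25); the concrete numbers are illustrative and not part of this patch:

    from vllm.transformers_utils.configs.exaone import ExaoneConfig

    cfg = ExaoneConfig(hidden_size=2048, rotary_pct=0.25)
    print(cfg.intermediate_size)        # 8192: falls back to 4 * hidden_size when unset
    print(cfg.rescale_before_lm_head)   # True: default tracks rotary_pct == 0.25
    print(cfg.num_hidden_layers)        # resolves to num_layers via attribute_map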
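The attention hunks earlier in the patch keep vLLM's usual head-partitioning arithmetic (num_kv_heads = max(1, total_num_kv_heads // tp_size), head_dim defaulting to hidden_size // total_num_heads). A worked example of the resulting per-rank sizes; the numbers are illustrative, and the per-rank split of query heads follows the standard vLLM convention where this diff elides context:

    total_num_heads, total_num_kv_heads, tp_size = 32, 8, 4
    hidden_size = 4096

    num_heads = total_num_heads // tp_size                 # 8 query heads per rank (assumed convention)
    num_kv_heads = max(1, total_num_kv_heads // tp_size)   # 2 KV heads per rank (GQA)
    head_dim = hidden_size // total_num_heads              # 128, unless config.head_dim overrides it
    q_size = num_heads * head_dim                          # 1024
    kv_size = num_kv_heads * head_dim                      # 256
    scaling = head_dim ** -0.5
    print(q_size, kv_size, round(scaling, 4))              # 1024 256 0.0884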
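For an end-to-end check of the new ExaoneForCausalLM path once this patch is applied, a smoke test along these lines should work. The checkpoint id and sampling values are assumptions rather than part of this diff, and trust_remote_code may or may not be required depending on how the tokenizer shipped with the checkpoint is resolved:

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",  # assumed checkpoint id
        trust_remote_code=True,  # possibly needed for the checkpoint's tokenizer
    )
    params = SamplingParams(temperature=0.0, max_tokens=64)
    outputs = llm.generate(["Explain rotary position embeddings in one sentence."], params)
    print(outputs[0].outputs[0].text)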