From 66767ecb68b893cb6eb67215529b744ac9a50c44 Mon Sep 17 00:00:00 2001 From: nayohan Date: Wed, 28 Aug 2024 10:39:56 +0900 Subject: [PATCH] [MOD] check ruff format --- vllm/model_executor/models/exaone.py | 253 +++++++++++++--------- vllm/transformers_utils/configs/exaone.py | 105 +++++---- 2 files changed, 217 insertions(+), 141 deletions(-) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 098377daada01..f4100f082b05e 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -22,6 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Exaone model compatible with HuggingFace weights.""" + from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch @@ -29,35 +30,51 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import ( + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, +) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) + get_compressed_tensors_cache_scale, +) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) + DEFAULT_VOCAB_PADDING_SIZE, + ParallelLMHead, + VocabParallelEmbedding, +) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, + kv_cache_scales_loader, + maybe_remap_kv_scale_name, +) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.transformers_utils.configs.exaone import ExaoneConfig from vllm.utils import is_hip from vllm.model_executor.models.interfaces import SupportsLoRA -from vllm.model_executor.models.utils import PPMissingLayer, is_pp_missing_parameter, make_layers +from vllm.model_executor.models.utils import ( + PPMissingLayer, + is_pp_missing_parameter, + make_layers, +) class ExaoneGatedMLP(nn.Module): - def __init__( self, hidden_size: int, @@ -73,15 +90,20 @@ def __init__( output_sizes=[intermediate_size] * 2, bias=bias, quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.c_proj = RowParallelLinear(input_size=intermediate_size, - output_size=hidden_size, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.c_proj") + prefix=f"{prefix}.gate_up_proj", + ) + self.c_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.c_proj", + ) if 
hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") + raise ValueError( + f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now." + ) self.act_fn = SiluAndMul() def forward(self, x): @@ -92,7 +114,6 @@ def forward(self, x): class ExaoneAttention(nn.Module): - def __init__( self, config: ExaoneConfig, @@ -124,8 +145,9 @@ def __init__( assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr( + config, "head_dim", self.hidden_size // self.total_num_heads + ) self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -162,12 +184,14 @@ def __init__( rope_scaling=rope_scaling, is_neox_style=is_neox_style, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + ) def forward( self, @@ -185,7 +209,6 @@ def forward( class ExaoneBlockAttention(nn.Module): - def __init__( self, config: ExaoneConfig, @@ -222,17 +245,15 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: - return self.attention( positions=positions, hidden_states=hidden_states, kv_cache=kv_cache, - attn_metadata=attn_metadata + attn_metadata=attn_metadata, ) class ExaoneDecoderLayer(nn.Module): - def __init__( self, config: ExaoneConfig, @@ -245,21 +266,26 @@ def __init__( rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None): + config, "original_max_position_embeddings", None + ): rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) + config.original_max_position_embeddings + ) + max_position_embeddings = getattr( + config, "max_position_embeddings", 8192 + ) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias attention_bias = getattr(config, "attention_bias", False) or getattr( - config, "bias", False) + config, "bias", False + ) self.attn = ExaoneBlockAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, - num_kv_heads=getattr(config, "num_key_value_heads", - config.num_attention_heads), + num_kv_heads=getattr( + config, "num_key_value_heads", config.num_attention_heads + ), rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, @@ -276,10 +302,8 @@ def __init__( bias=getattr(config, "mlp_bias", False), prefix=f"{prefix}.mlp", ) - self.ln_1 = RMSNorm(config.hidden_size, - eps=config.layer_norm_epsilon) - self.ln_2 = RMSNorm(config.hidden_size, - eps=config.layer_norm_epsilon) + self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) def forward( self, @@ -294,8 +318,7 @@ def forward( residual = hidden_states hidden_states = self.ln_1(hidden_states) else: 
- hidden_states, residual = self.ln_1( - hidden_states, residual) + hidden_states, residual = self.ln_1(hidden_states, residual) hidden_states = self.attn( positions=positions, hidden_states=hidden_states, @@ -304,14 +327,12 @@ def forward( ) # Fully Connected - hidden_states, residual = self.ln_2( - hidden_states, residual) + hidden_states, residual = self.ln_2(hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual class ExaoneModel(nn.Module): - def __init__( self, config: ExaoneConfig, @@ -323,12 +344,16 @@ def __init__( super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 + lora_vocab = ( + (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) + if lora_config + else 0 + ) self.vocab_size = config.vocab_size + lora_vocab self.wte = config.vocab_size - if get_pp_group().is_first_rank or (config.tie_word_embeddings - and get_pp_group().is_last_rank): + if get_pp_group().is_first_rank or ( + config.tie_word_embeddings and get_pp_group().is_last_rank + ): self.wte = VocabParallelEmbedding( self.vocab_size, config.hidden_size, @@ -339,13 +364,18 @@ def __init__( self.wte = PPMissingLayer() self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: ExaoneDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), - prefix=f"{prefix}.h") + lambda prefix: ExaoneDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.h", + ) if get_pp_group().is_last_rank: - self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.ln_f = RMSNorm( + config.hidden_size, eps=config.layer_norm_epsilon + ) else: self.ln_f = PPMissingLayer() @@ -383,10 +413,9 @@ def forward( ) if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) hidden_states, _ = self.ln_f(hidden_states, residual) return hidden_states @@ -407,8 +436,12 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA): # LoRA specific attributes supported_lora_modules = [ - "qkv_proj", "out_proj", "gate_up_proj", "c_proj", "wte", - "lm_head" + "qkv_proj", + "out_proj", + "gate_up_proj", + "c_proj", + "wte", + "lm_head", ] embedding_modules = { "wte": "input_embeddings", @@ -436,11 +469,13 @@ def __init__( self.config = config self.lora_config = lora_config - self.transformer = ExaoneModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.transformer = ExaoneModel( + config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model", + ) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -452,16 +487,17 @@ def __init__( padding_size=DEFAULT_VOCAB_PADDING_SIZE # We need bigger padding if using lora for kernel # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, + if not lora_config + else lora_config.lora_vocab_padding_size, quant_config=quant_config, ) if config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, - logit_scale) + self.logits_processor = 
LogitsProcessor( + self.unpadded_vocab_size, config.vocab_size, logit_scale + ) self.sampler = Sampler() else: self.lm_head = PPMissingLayer() @@ -474,8 +510,9 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - model_output = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + model_output = self.transformer( + input_ids, positions, kv_caches, attn_metadata, intermediate_tensors + ) return model_output def compute_logits( @@ -483,8 +520,9 @@ def compute_logits( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor( + self.lm_head, hidden_states, sampling_metadata + ) return logits def sample( @@ -496,18 +534,22 @@ def sample( return next_tokens def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) + self, batch_size: int, dtype: torch.dtype, device: torch.device + ) -> IntermediateTensors: + return IntermediateTensors( + { + "hidden_states": torch.zeros( + (batch_size, self.config.hidden_size), + dtype=dtype, + device=device, + ), + "residual": torch.zeros( + (batch_size, self.config.hidden_size), + dtype=dtype, + device=device, + ), + } + ) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ @@ -522,8 +564,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): + if ( + "rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name + ): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
continue @@ -535,12 +579,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if scale_name := get_compressed_tensors_cache_scale(name): # Loading kv cache scales for compressed-tensors quantization param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) @@ -569,8 +614,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) weight_loader(param, loaded_weight) # If this function is called, it should always initialize KV cache scale @@ -580,9 +626,12 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, tp_rank, tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type): + quantization_param_path, + tp_rank, + tp_size, + self.config.num_hidden_layers, + self.config.__class__.model_type, + ): if not isinstance(self.transformer.h[layer_idx], nn.Identity): layer_self_attn = self.transformer.h[layer_idx].attn @@ -595,5 +644,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: if hasattr(layer_self_attn, "kv_scale"): layer_self_attn.attn._kv_scale = scaling_factor else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") \ No newline at end of file + raise RuntimeError( + "Self attention has no KV cache scaling " + "factor attribute!" + ) diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 91433cddc95ed..13baf4f18c5ed 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -14,74 +14,92 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Exaone model configuration """ +"""Exaone model configuration""" +from typing import Dict from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) -EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP = { -} +EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {} class ExaoneConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.ExaoneModel`. It is used to - instantiate a GPT Lingvo model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the Exaone - - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + This is the configuration class to store the configuration of a :class: + `~transformers.ExaoneModel`. 
It is used to instantiate a GPT Lingvo model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the Exaone + Configuration objects inherit from :class:`~transformers.PretrainedConfig` + and can be used to control the model outputs. Read the documentation from : + class:`~transformers.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, `optional`, defaults to 50257): - Vocabulary size of the GPT Lingvo model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.ExaoneModel`. Vocabulary size of the model. - Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of - :class:`~transformers.EXAONEModel`. + Vocabulary size of the GPT Lingvo model. Defines the number of + different tokens that can be represented by the :obj:`inputs_ids` + passed when calling :class:`~transformers.ExaoneModel`. Vocabulary + size of the model. + Defines the different tokens that can be represented by the + `inputs_ids` passed to the forward method of :class: + `~transformers.EXAONEModel`. hidden_size (:obj:`int`, `optional`, defaults to 2048): Dimensionality of the encoder layers and the pooler layer. num_layers (:obj:`int`, `optional`, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. + Number of attention heads for each attention layer in the + Transformer decoder. num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. + This is the number of key_value heads that should be used to + implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi + Head Attention (MHA), if `num_key_value_heads=1 the model will use + Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, + each group key and value head should be constructed by meanpooling + all the original heads within that group. For more details checkout + [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not + specified, will default to `num_attention_heads`. rotary_pct (`float`, *optional*, defaults to 0.25): percentage of hidden dimensions to allocate to rotary embeddings intermediate_size (:obj:`int`, `optional`, defaults to 8192): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. 
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in + the Transformer encoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, + defaults to :obj:`"gelu_new"`): + The non-linear activation function (function or string) in the + encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, + :obj:`"selu"` and :obj:`"gelu_new"` are supported. embed_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + The dropout probabilitiy for all fully connected layers in the + embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, `optional`, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.EXAONEModel`. + The vocabulary size of the :obj:`token_type_ids` passed when calling + :class:`~transformers.EXAONEModel`. initializer_range (:obj:`float`, `optional`, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): The epsilon used by the layer normalization layers. use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if ``config.is_decoder=True``. - gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): - If True, use gradient checkpointing to save memory at the expense of slower backward pass. - + Whether or not the model should return the last key/values + attentions (not used by all models). + Only relevant if ``config.is_decoder=True``. + gradient_checkpointing (:obj:`bool`, `optional`, + defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense + of slower backward pass. 
Example:: >>> from transformers import ExoneModel, ExaoneConfig @@ -95,6 +113,7 @@ class ExaoneConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config """ + model_type = "exaone" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_hidden_layers": "num_layers"} @@ -119,9 +138,14 @@ def __init__( bos_token_id=0, eos_token_id=2, tie_word_embeddings=True, - **kwargs + **kwargs, ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -135,7 +159,7 @@ def __init__( if intermediate_size: self.intermediate_size = intermediate_size else: - self.intermediate_size = hidden_size * 4 + self.intermediate_size = hidden_size * 4 self.activation_function = activation_function self.resid_dropout = resid_dropout self.embed_dropout = embed_dropout @@ -148,7 +172,6 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - #[HYUNJIK] additionl configuation self.use_logit_cap = kwargs.pop("use_logit_cap", False) self.ln_no_scale = kwargs.pop("ln_no_scale", False) self.use_gated = kwargs.pop("use_gated", False) @@ -161,6 +184,8 @@ def __init__( self.rotary_expand_length = kwargs.pop("rotary_expand_length", None) self.rotary_base = kwargs.pop("rotary_base", 10000.0) self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False) - self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head", (rotary_pct == 0.25)) + self.rescale_before_lm_head = kwargs.pop( + "rescale_before_lm_head", (rotary_pct == 0.25) + ) if self.use_rotary_pos: self.use_absolute_pos = False
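Reviewer note: the config file above derives a few values instead of storing them directly. A minimal sketch of how those defaults behave, assuming the constructor defaults documented in the docstring (hidden_size 2048, rotary_pct 0.25); the concrete numbers are illustrative and not part of this patch:

    from vllm.transformers_utils.configs.exaone import ExaoneConfig

    cfg = ExaoneConfig(hidden_size=2048, rotary_pct=0.25)
    print(cfg.intermediate_size)        # 8192: falls back to 4 * hidden_size when unset
    print(cfg.rescale_before_lm_head)   # True: default tracks rotary_pct == 0.25
    print(cfg.num_hidden_layers)        # resolves to num_layers via attribute_map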
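The attention hunks earlier in the patch keep vLLM's usual head-partitioning arithmetic (num_kv_heads = max(1, total_num_kv_heads // tp_size), head_dim defaulting to hidden_size // total_num_heads). A worked example of the resulting per-rank sizes; the numbers are illustrative, and the per-rank split of query heads follows the standard vLLM convention where this diff elides context:

    total_num_heads, total_num_kv_heads, tp_size = 32, 8, 4
    hidden_size = 4096

    num_heads = total_num_heads // tp_size                 # 8 query heads per rank (assumed convention)
    num_kv_heads = max(1, total_num_kv_heads // tp_size)   # 2 KV heads per rank (GQA)
    head_dim = hidden_size // total_num_heads              # 128, unless config.head_dim overrides it
    q_size = num_heads * head_dim                          # 1024
    kv_size = num_kv_heads * head_dim                      # 256
    scaling = head_dim ** -0.5
    print(q_size, kv_size, round(scaling, 4))              # 1024 256 0.0884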
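For an end-to-end check of the new ExaoneForCausalLM path once this patch is applied, a smoke test along these lines should work. The checkpoint id and sampling values are assumptions rather than part of this diff, and trust_remote_code may or may not be required depending on how the tokenizer shipped with the checkpoint is resolved:

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",  # assumed checkpoint id
        trust_remote_code=True,  # possibly needed for the checkpoint's tokenizer
    )
    params = SamplingParams(temperature=0.0, max_tokens=64)
    outputs = llm.generate(["Explain rotary position embeddings in one sentence."], params)
    print(outputs[0].outputs[0].text)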