|
27 | 27 | from vllm.transformers_utils.config import (
|
28 | 28 | ConfigFormat, get_config, get_hf_image_processor_config,
|
29 | 29 | get_hf_text_config, get_pooling_config,
|
30 |
| - get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) |
| 30 | + get_sentence_transformer_tokenizer_config, is_encoder_decoder, |
| 31 | + try_get_generation_config, uses_mrope) |
31 | 32 | from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
|
32 | 33 | get_cpu_memory, print_warning_once, random_uuid,
|
33 | 34 | resolve_obj_by_qualname)
|
@@ -160,6 +161,7 @@ class ModelConfig:
|
160 | 161 | logits processor qualified names that can be passed with the
|
161 | 162 | `logits_processors` extra completion argument. Defaults to None,
|
162 | 163 | which allows no processors.
|
| 164 | + generation_config: Configuration parameter file for generation. |
163 | 165 | """
|
164 | 166 |
|
165 | 167 | def compute_hash(self) -> str:
|
@@ -218,7 +220,8 @@ def __init__(self,
|
218 | 220 | disable_mm_preprocessor_cache: bool = False,
|
219 | 221 | override_neuron_config: Optional[Dict[str, Any]] = None,
|
220 | 222 | override_pooler_config: Optional["PoolerConfig"] = None,
|
221 |
| - logits_processor_pattern: Optional[str] = None) -> None: |
| 223 | + logits_processor_pattern: Optional[str] = None, |
| 224 | + generation_config: Optional[str] = None) -> None: |
222 | 225 | self.model = model
|
223 | 226 | self.tokenizer = tokenizer
|
224 | 227 | self.tokenizer_mode = tokenizer_mode
|
@@ -348,6 +351,8 @@ def __init__(self,
|
348 | 351 | self.pooler_config = self._init_pooler_config(override_pooler_config)
|
349 | 352 | self.logits_processor_pattern = logits_processor_pattern
|
350 | 353 |
|
| 354 | + self.generation_config = generation_config |
| 355 | + |
351 | 356 | self._verify_quantization()
|
352 | 357 | self._verify_cuda_graph()
|
353 | 358 | self._verify_bnb_config()
|
@@ -813,6 +818,56 @@ def get_multimodal_config(self) -> "MultiModalConfig":
|
813 | 818 |
|
814 | 819 | return self.multimodal_config
|
815 | 820 |
|
def try_get_generation_config(self) -> Dict[str, Any]:
    """Load the generation config and return it as a plain dict.

    The source depends on `self.generation_config`:

    * `None` or `"auto"`: the config is looked up via `self.model`,
      pinned to `self.revision`.
    * anything else: the value itself is handed to the loader as the
      location to read from (no revision is passed in this case).

    Returns:
        The result of `to_diff_dict()` on the loaded config object, or
        an empty dict when the loader returns None.
    """
    explicit_source = self.generation_config

    if explicit_source is not None and explicit_source != "auto":
        # A dedicated generation-config location was supplied.
        loaded = try_get_generation_config(
            explicit_source,
            trust_remote_code=self.trust_remote_code,
        )
    else:
        # Fall back to whatever ships alongside the model itself.
        loaded = try_get_generation_config(
            self.model,
            trust_remote_code=self.trust_remote_code,
            revision=self.revision,
        )

    # NOTE(review): `loaded` is presumably a Hugging Face GenerationConfig;
    # confirm against the helper in vllm.transformers_utils.config.
    return {} if loaded is None else loaded.to_diff_dict()
| 838 | + |
def get_diff_sampling_param(self) -> Dict[str, Any]:
    """Collect sampling-parameter overrides from the generation config.

    When `generation_config` is None nothing is loaded and an empty
    dict is returned. Otherwise the dict produced by
    `try_get_generation_config()` is narrowed down to the supported
    sampling keys, keeping only entries whose value is not None.

    Returns:
        Dict[str, Any]: The supported sampling parameters found in the
        generation config, or an empty dict if `generation_config` is
        not set or none of the supported keys carry a value.
    """
    if self.generation_config is None:
        # Generation config is not enabled: no overrides to report.
        return {}

    supported_keys = (
        "repetition_penalty",
        "temperature",
        "top_k",
        "top_p",
        "min_p",
    )
    generation_values = self.try_get_generation_config()

    overrides: Dict[str, Any] = {}
    for key in supported_keys:
        value = generation_values.get(key)
        # Missing keys and explicit None values are both skipped.
        if value is not None:
            overrides[key] = value
    return overrides
| 870 | + |
816 | 871 | @property
|
817 | 872 | def is_encoder_decoder(self) -> bool:
|
818 | 873 | """Extract the HF encoder/decoder model flag."""
|
|
0 commit comments