1
1
import itertools
2
+ import json
2
3
import warnings
3
4
from contextlib import contextmanager
4
5
from typing import (Any , ClassVar , Dict , List , Optional , Sequence , Tuple , Type ,
9
10
from vllm import envs
10
11
from vllm .beam_search import (BeamSearchInstance , BeamSearchOutput ,
11
12
BeamSearchSequence , get_beam_search_score )
13
+ from vllm .config import CompilationConfig
12
14
from vllm .engine .arg_utils import (EngineArgs , HfOverrides , PoolerConfig ,
13
15
TaskOption )
14
16
from vllm .engine .llm_engine import LLMEngine
@@ -107,13 +109,16 @@ class LLM:
107
109
hf_overrides: If a dictionary, contains arguments to be forwarded to the
108
110
HuggingFace config. If a callable, it is called to update the
109
111
HuggingFace config.
112
+ compilation_config: Either an integer or a dictionary. If it is an integer,
113
+ it is used as the level of compilation optimization. If it is a dictionary,
114
+ it can specify the full compilation configuration and must be JSON-serializable.
110
115
**kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
111
116
:ref:`engine_args`)
112
117
113
118
Note:
114
119
This class is intended to be used for offline inference. For online
115
120
serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
116
- """
121
+ """ # noqa
117
122
118
123
DEPRECATE_LEGACY : ClassVar [bool ] = False
119
124
"""A flag to toggle whether to deprecate the legacy generate/encode API."""
@@ -166,6 +171,7 @@ def __init__(
166
171
# After positional args are removed, move this right below `model`
167
172
task : TaskOption = "auto" ,
168
173
override_pooler_config : Optional [PoolerConfig ] = None ,
174
+ compilation_config : Optional [Union [int , Dict [str , Any ]]] = None ,
169
175
** kwargs ,
170
176
) -> None :
171
177
'''
@@ -178,6 +184,12 @@ def __init__(
178
184
if "disable_log_stats" not in kwargs :
179
185
kwargs ["disable_log_stats" ] = True
180
186
187
+ if compilation_config is not None :
188
+ compilation_config_instance = CompilationConfig .from_cli (
189
+ json .dumps (compilation_config ))
190
+ else :
191
+ compilation_config_instance = None
192
+
181
193
engine_args = EngineArgs (
182
194
model = model ,
183
195
task = task ,
@@ -202,6 +214,7 @@ def __init__(
202
214
hf_overrides = hf_overrides ,
203
215
mm_processor_kwargs = mm_processor_kwargs ,
204
216
override_pooler_config = override_pooler_config ,
217
+ compilation_config = compilation_config_instance ,
205
218
** kwargs ,
206
219
)
207
220
# Logic to switch between engines is done at runtime instead of import
0 commit comments