
Commit c1461be

[Model] Add user-configurable task for models that support both generation and embedding (vllm-project#9424)

Parent: c383515
33 files changed: +451 -201 lines

docs/source/models/supported_models.rst (+8)

@@ -294,6 +294,10 @@ Text Embedding
       -
       - ✅︎
 
+.. important::
+    Some model architectures support both generation and embedding tasks.
+    In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
+
 Reward Modeling
 ---------------
 
@@ -482,6 +486,10 @@ Multimodal Embedding
       - 🚧
       - ✅︎
 
+.. important::
+    Some model architectures support both generation and embedding tasks.
+    In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
+
 ----
 
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
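
Note: to make the new doc note concrete, here is a minimal offline sketch of the task switch this commit introduces. The model name is a placeholder for any architecture that appears in both the generation and embedding tables; the LLM(task=...) argument mirrors the --task CLI flag, and encode() is the embedding-side entry point (output attribute layout assumed to follow vLLM's EmbeddingRequestOutput at the time of this commit).

    from vllm import LLM

    # Placeholder model name: substitute any architecture listed in both
    # the generation and embedding tables above. Without task="embedding",
    # such a model would be loaded in its default generation mode.
    llm = LLM(model="your-org/dual-task-model", task="embedding")

    # In embedding mode the engine is queried with encode() instead of
    # generate(); each output carries the pooled embedding vector.
    outputs = llm.encode(["Follow the white rabbit."])
    print(len(outputs[0].outputs.embedding))  # embedding dimension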

docs/source/models/vlm.rst (+2 -2)

@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc
 
 .. code-block:: bash
 
-    vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
-      --trust-remote-code --limit-mm-per-prompt image=2
+    vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
+      --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 
 .. important::
     Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,

examples/offline_inference_vision_language_embedding.py (+1)

@@ -7,6 +7,7 @@
 # Create an LLM.
 llm = LLM(
     model="TIGER-Lab/VLM2Vec-Full",
+    task="embedding",
     trust_remote_code=True,
     max_model_len=4096,
     max_num_seqs=2,
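
For reference, a rough sketch of how the rest of that example script could drive the embedding-mode engine. The prompt template and image asset below are assumptions patterned on vLLM's other Phi-3.5-vision examples, not part of this diff.

    from vllm import LLM
    from vllm.assets.image import ImageAsset

    llm = LLM(
        model="TIGER-Lab/VLM2Vec-Full",
        task="embedding",  # the flag added in this commit
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
    )

    # Assumed prompt format for the Phi-3.5-vision-based VLM2Vec model;
    # adjust to the template the checkpoint was trained with.
    image = ImageAsset("cherry_blossom").pil_image
    outputs = llm.encode({
        "prompt": "<|image_1|> Represent the given image.",
        "multi_modal_data": {"image": image},
    })
    print(len(outputs[0].outputs.embedding))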

examples/openai_api_client_for_multimodal.py (+2 -2)

@@ -7,8 +7,8 @@
 vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
 
 (multi-image inference with Phi-3.5-vision-instruct)
-vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
-    --trust-remote-code --limit-mm-per-prompt image=2
+vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
+    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 
 (audio inference with Ultravox)
 vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096

tests/conftest.py (+3 -1)

@@ -25,7 +25,7 @@
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel,

@@ -619,6 +619,7 @@ class VllmRunner:
     def __init__(
         self,
         model_name: str,
+        task: TaskOption = "auto",
         tokenizer_name: Optional[str] = None,
         # Use smaller max model length, otherwise bigger model cannot run due
         # to kv cache size limit.

@@ -634,6 +635,7 @@ def __init__(
     ) -> None:
         self.model = LLM(
             model=model_name,
+            task=task,
             tokenizer=tokenizer_name,
             trust_remote_code=True,
             dtype=dtype,
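
A hypothetical sketch of how a model test could exercise the new parameter through the existing vllm_runner fixture; the model name, fixture signatures, and encode() helper are assumptions based on the patterns already used in vLLM's embedding tests, not code from this diff.

    # Hypothetical test, not part of this commit.
    def test_vlm2vec_text_embedding(vllm_runner, example_prompts):
        # task="embedding" forces the dual-task architecture into embedding
        # mode; with the default task="auto" the model would be resolved to
        # its generation variant.
        with vllm_runner("TIGER-Lab/VLM2Vec-Full", task="embedding",
                         dtype="half") as vllm_model:
            embeddings = vllm_model.encode(example_prompts)
        assert all(len(vec) > 0 for vec in embeddings)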

tests/core/test_chunked_prefill_scheduler.py (+13 -2)

@@ -33,7 +33,8 @@ def test_simple():
     num_seq_group = 4
     max_model_len = 16
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
                                        num_seq_group,
                                        max_model_len,
                                        enable_chunked_prefill=True)

@@ -78,6 +79,7 @@ def test_chunk():
     max_model_len = 80
     max_num_batched_tokens = 64
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -126,6 +128,7 @@ def test_complex():
     max_model_len = 80
     max_num_batched_tokens = 64
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -196,6 +199,7 @@ def test_maximal_decoding():
     max_model_len = 8
     max_num_batched_tokens = 2
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -289,6 +293,7 @@ def test_prompt_limit():
     max_model_len = 64
     max_num_batched_tokens = 32
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
     max_seqs = 64
     max_model_len = 32
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
                                        max_seqs,
                                        max_model_len,
                                        enable_chunked_prefill=True)

@@ -348,6 +354,7 @@ def test_swap():
     max_model_len = 200
     max_num_batched_tokens = 30
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
     max_model_len = 200
     max_num_batched_tokens = 30
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
     max_model_len = 200
     max_num_batched_tokens = 30
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
     max_model_len = 80
     max_num_batched_tokens = 64
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -617,6 +627,7 @@ def test_perfix_caching():
     max_model_len = 80
     max_num_batched_tokens = 64
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,
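
The pattern across these hunks suggests SchedulerConfig now takes the runtime task as its first positional parameter. A minimal sketch of the call shape as inferred from these updated call sites (anything beyond the parameters visible in this diff is an assumption):

    from vllm.config import SchedulerConfig

    # Inferred from the updated tests: the task ("generate" here; presumably
    # "embedding" for embedding-mode engines) now leads the arguments,
    # followed by the existing batching limits.
    scheduler_config = SchedulerConfig(
        "generate",
        max_num_batched_tokens=64,
        max_num_seqs=4,
        max_model_len=16,
        enable_chunked_prefill=True,
    )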

tests/core/test_scheduler.py (+32 -24)

@@ -20,9 +20,10 @@
 def test_scheduler_add_seq_group():
     block_size = 4
     scheduler_config = SchedulerConfig(
-        100,
-        64,
-        1,
+        "generate",
+        max_num_batched_tokens=100,
+        max_num_seqs=64,
+        max_model_len=1,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
     cache_config.num_cpu_blocks = 4

@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
 def test_scheduler_abort_seq_group():
     block_size = 4
     scheduler_config = SchedulerConfig(
-        100,
-        64,
-        1,
+        "generate",
+        max_num_batched_tokens=100,
+        max_num_seqs=64,
+        max_model_len=1,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 4

@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
     num_seq_group = 4
     max_model_len = 16
     scheduler_config = SchedulerConfig(
-        64,
-        num_seq_group,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=num_seq_group,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8

@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
     max_model_len = 30
     max_batched_num_tokens = 30
     scheduler_config = SchedulerConfig(
-        max_batched_num_tokens,
-        2,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=max_batched_num_tokens,
+        max_num_seqs=2,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16

@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
     block_size = 4
     max_model_len = 16
     scheduler_config = SchedulerConfig(
-        64,
-        2,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=2,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 2

@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
     max_seq_group = 2
     max_model_len = 16
     scheduler_config = SchedulerConfig(
-        64,
-        max_seq_group,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=max_seq_group,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8

@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
 def test_scheduler_delay_factor():
     block_size = 4
     scheduler_config = SchedulerConfig(
-        100,
-        64,
-        16,
+        "generate",
+        max_num_batched_tokens=100,
+        max_num_seqs=64,
+        max_model_len=16,
         delay_factor=0.5,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")

@@ -350,9 +357,10 @@ def initialize_scheduler(
 ):
     block_size = block_size
     scheduler_config = SchedulerConfig(
-        max_token_budget,
-        max_num_seqs,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=max_token_budget,
+        max_num_seqs=max_num_seqs,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = num_cpu_blocks

tests/core/test_scheduler_encoder_decoder.py (+6 -1)

@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
     block_size = 4
     num_seq_group = 4
     max_model_len = 16
-    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    scheduler_config = SchedulerConfig(
+        task="generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=num_seq_group,
+        max_model_len=max_model_len,
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group
     cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group
