12
12
from vllm .core .interfaces import AllocStatus , BlockSpaceManager
13
13
from vllm .logger import init_logger
14
14
from vllm .lora .request import LoRARequest
15
- from vllm .pooling_params import PoolingParams
16
15
from vllm .prompt_adapter .request import PromptAdapterRequest
17
16
from vllm .sequence import (Sequence , SequenceData , SequenceGroup ,
18
17
SequenceGroupMetadata , SequenceGroupMetadataDelta ,
@@ -524,7 +523,7 @@ def _schedule_running(
524
523
chunked number of tokens are scheduled if
525
524
`budget.num_batched_tokens` has not enough capacity to schedule
526
525
all tokens.
527
-
526
+
528
527
Returns:
529
528
SchedulerRunningOutputs.
530
529
"""
@@ -842,10 +841,10 @@ def _schedule_priority_preemption(
842
841
self ._get_num_new_uncached_and_cached_tokens (
843
842
seq_group , SequenceStatus .WAITING , False , budget ))
844
843
845
- # Only preempt if priority inversion exists
844
+ #Only preempt if priority inversion exists
846
845
while running_queue and self ._get_priority (
847
846
running_queue [- 1 ]) > self ._get_priority (seq_group ):
848
- # Only preempt if waiting sequence cannot be allocated
847
+ #Only preempt if waiting sequence cannot be allocated
849
848
can_allocate = self .block_manager .can_allocate (seq_group )
850
849
if (num_new_tokens_uncached > 0
851
850
and can_allocate == AllocStatus .OK
@@ -855,7 +854,7 @@ def _schedule_priority_preemption(
855
854
)):
856
855
break
857
856
858
- # Adjust budget to remove the victim sequence group
857
+ #Adjust budget to remove the victim sequence group
859
858
vseq_group = running_queue .pop ()
860
859
num_running_tokens_uncached , _ = (
861
860
self ._get_num_new_uncached_and_cached_tokens (
@@ -866,11 +865,11 @@ def _schedule_priority_preemption(
866
865
budget .subtract_num_seqs (vseq_group .request_id ,
867
866
num_running_seqs )
868
867
869
- # Preempt out the victim sequence group
868
+ #Preempt out the victim sequence group
870
869
self ._preempt (vseq_group , blocks_to_swap_out )
871
870
waiting_queue .appendleft (vseq_group )
872
871
force_preemption_count += 1
873
- # Put the sequence back into the waiting queue
872
+ #Put the sequence back into the waiting queue
874
873
waiting_queue .appendleft (seq_group )
875
874
876
875
waiting_queue = deque (sorted (waiting_queue , key = self ._get_priority ))
@@ -1037,7 +1036,7 @@ def _schedule_prefills(
1037
1036
1038
1037
def _schedule_default (self ) -> SchedulerOutputs :
1039
1038
"""Schedule queued requests.
1040
-
1039
+
1041
1040
The current policy is designed to optimize the throughput. First,
1042
1041
it batches as many prefill requests as possible. And it schedules
1043
1042
decodes. If there's a pressure on GPU memory, decode requests can
@@ -1142,7 +1141,7 @@ def _schedule_default(self) -> SchedulerOutputs:
1142
1141
1143
1142
def _schedule_chunked_prefill (self ) -> SchedulerOutputs :
1144
1143
"""Schedule queued requests.
1145
-
1144
+
1146
1145
Chunked prefill allows to chunk prefill requests, batch them together
1147
1146
with decode requests. This policy 1. schedule as many decoding requests
1148
1147
as possible. 2. schedule chunked prefill requests that are not
@@ -1351,25 +1350,6 @@ def schedule(
1351
1350
seqs [0 ].data .get_len ()):
1352
1351
do_sample = False
1353
1352
1354
- pooling_params = seq_group .pooling_params
1355
-
1356
- # Store instruction_seq in pooling_params.
1357
- instruction_seq = seq .inputs .inputs .get ("instruction_seq" )
1358
- if instruction_seq is not None :
1359
- if pooling_params is None :
1360
- pooling_params = PoolingParams ()
1361
- pooling_params .additional_data = {
1362
- "instruction_seq" : instruction_seq
1363
- }
1364
- elif pooling_params .additional_data is None :
1365
- pooling_params .additional_data = {
1366
- "instruction_seq" : instruction_seq
1367
- }
1368
- else :
1369
- pooling_params .additional_data [
1370
- "instruction_seq" ] = seq .inputs .inputs .get (
1371
- "instruction_seq" )
1372
-
1373
1353
# It assumes the scheduled_seq_groups is ordered by
1374
1354
# prefill < decoding.
1375
1355
if is_first_prefill or not self .scheduler_config .send_delta_data :
@@ -1380,7 +1360,7 @@ def schedule(
1380
1360
sampling_params = seq_group .sampling_params ,
1381
1361
block_tables = block_tables ,
1382
1362
do_sample = do_sample ,
1383
- pooling_params = pooling_params ,
1363
+ pooling_params = seq_group . pooling_params ,
1384
1364
token_chunk_size = token_chunk_size ,
1385
1365
lora_request = seq_group .lora_request ,
1386
1366
computed_block_nums = common_computed_block_nums ,
0 commit comments