Revert changes in data.py and scheduler.py

pooyadavoodi · pooyadavoodi · commit 1aec4d37acdc · 2024-12-06T08:53:23.000+09:00
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
@@ -12,7 +12,6 @@
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
                            SequenceGroupMetadata, SequenceGroupMetadataDelta,
@@ -524,7 +523,7 @@ def _schedule_running(
                 chunked number of tokens are scheduled  if
                 `budget.num_batched_tokens` has not enough capacity to schedule
                 all tokens.
-
+    
         Returns:
             SchedulerRunningOutputs.
         """
@@ -842,10 +841,10 @@ def _schedule_priority_preemption(
                 self._get_num_new_uncached_and_cached_tokens(
                     seq_group, SequenceStatus.WAITING, False, budget))
 
-            # Only preempt if priority inversion exists
+            #Only preempt if priority inversion exists
             while running_queue and self._get_priority(
                     running_queue[-1]) > self._get_priority(seq_group):
-                # Only preempt if waiting sequence cannot be allocated
+                #Only preempt if waiting sequence cannot be allocated
                 can_allocate = self.block_manager.can_allocate(seq_group)
                 if (num_new_tokens_uncached > 0
                         and can_allocate == AllocStatus.OK
@@ -855,7 +854,7 @@ def _schedule_priority_preemption(
                         )):
                     break
 
-                # Adjust budget to remove the victim sequence group
+                #Adjust budget to remove the victim sequence group
                 vseq_group = running_queue.pop()
                 num_running_tokens_uncached, _ = (
                     self._get_num_new_uncached_and_cached_tokens(
@@ -866,11 +865,11 @@ def _schedule_priority_preemption(
                 budget.subtract_num_seqs(vseq_group.request_id,
                                          num_running_seqs)
 
-                # Preempt out the victim sequence group
+                #Preempt out the victim sequence group
                 self._preempt(vseq_group, blocks_to_swap_out)
                 waiting_queue.appendleft(vseq_group)
                 force_preemption_count += 1
-            # Put the sequence back into the waiting queue
+            #Put the sequence back into the waiting queue
             waiting_queue.appendleft(seq_group)
 
         waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))
@@ -1037,7 +1036,7 @@ def _schedule_prefills(
 
     def _schedule_default(self) -> SchedulerOutputs:
         """Schedule queued requests.
-
+        
         The current policy is designed to optimize the throughput. First,
         it batches as many prefill requests as possible. And it schedules
         decodes. If there's a pressure on GPU memory, decode requests can
@@ -1142,7 +1141,7 @@ def _schedule_default(self) -> SchedulerOutputs:
 
     def _schedule_chunked_prefill(self) -> SchedulerOutputs:
         """Schedule queued requests.
-
+        
         Chunked prefill allows to chunk prefill requests, batch them together
         with decode requests. This policy 1. schedule as many decoding requests
         as possible. 2. schedule chunked prefill requests that are not
@@ -1351,25 +1350,6 @@ def schedule(
                         seqs[0].data.get_len()):
                     do_sample = False
 
-            pooling_params = seq_group.pooling_params
-
-            # Store instruction_seq in pooling_params.
-            instruction_seq = seq.inputs.inputs.get("instruction_seq")
-            if instruction_seq is not None:
-                if pooling_params is None:
-                    pooling_params = PoolingParams()
-                    pooling_params.additional_data = {
-                        "instruction_seq": instruction_seq
-                    }
-                elif pooling_params.additional_data is None:
-                    pooling_params.additional_data = {
-                        "instruction_seq": instruction_seq
-                    }
-                else:
-                    pooling_params.additional_data[
-                        "instruction_seq"] = seq.inputs.inputs.get(
-                            "instruction_seq")
-
             # It assumes the scheduled_seq_groups is ordered by
             # prefill < decoding.
             if is_first_prefill or not self.scheduler_config.send_delta_data:
@@ -1380,7 +1360,7 @@ def schedule(
                     sampling_params=seq_group.sampling_params,
                     block_tables=block_tables,
                     do_sample=do_sample,
-                    pooling_params=pooling_params,
+                    pooling_params=seq_group.pooling_params,
                     token_chunk_size=token_chunk_size,
                     lora_request=seq_group.lora_request,
                     computed_block_nums=common_computed_block_nums,
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
@@ -170,14 +170,6 @@ class TokenInputs(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
-    instruction_seq: NotRequired[Optional[str]]
-    """
-    The instruction sequence that is usually prepended to the original prompt
-    when passing to the model. Certain models need to extract this instruction
-    sequence from the prompt in order to adjust certain operations of the
-    model such as the attention mask.
-    """
-
 
 def token_inputs(
     prompt_token_ids: List[int],
@@ -187,7 +179,6 @@ def token_inputs(
     multi_modal_inputs: Optional["MultiModalKwargs"] = None,
     multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
     mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-    instruction_seq: Optional[str] = None,
 ) -> TokenInputs:
     """Construct :class:`TokenInputs` from optional values."""
     inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
@@ -204,8 +195,6 @@ def token_inputs(
         inputs["multi_modal_placeholders"] = multi_modal_placeholders
     if mm_processor_kwargs is not None:
         inputs["mm_processor_kwargs"] = mm_processor_kwargs
-    if instruction_seq is not None:
-        inputs["instruction_seq"] = instruction_seq
 
     return inputs