From 938ef83ac4288b77d43b72fd7cf8ff700c38639b Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 6 Feb 2025 07:38:41 +0000 Subject: [PATCH 01/34] Initial commit Fails in rotary_embed layer in the view --- .../model_executor/layers/rotary_embedding.py | 20 +++++ vllm/worker/hpu_model_runner.py | 86 +++++++++++++++++-- 2 files changed, 100 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 6ebab8927a92b..49d6b0ac13134 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -849,6 +849,26 @@ def forward( dim=-1) query_shape = query.shape + breakpoint() + ''' + in CPU: + + query.shape + torch.Size([1451, 3584]) + + num_tokens + 1451 + (Pdb) self.head_size + 128 + + on HPU: + query.shape + torch.Size([32, 1024, 3584]) + (Pdb) num_tokens + 1024 + (Pdb) self.head_size + 128 + ''' query = query.view(num_tokens, -1, self.head_size) query_rot = query[..., :self.rotary_dim] query_pass = query[..., self.rotary_dim:] diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 276685274b957..ed60f836ed51a 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -43,6 +43,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -421,9 +422,11 @@ def _prepare_cos_sin(self, positions): current_module.prepare_cos_sin( positions, recompute_cos_sin=self.recompute_cos_sin) else: - raise AttributeError( - "The module at the end of the path does not have \ - a 'prepare_cos_sin' method.") + pass + # dont raise error for qwen2.5-vl + #raise AttributeError( + # "The module at the end of the path does not have \ + # a 'prepare_cos_sin' method.") def forward(self, *args, **kwargs): kwargs = kwargs.copy() @@ -759,6 +762,15 @@ def _set_gc_threshold(self) -> None: self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true' + @property + def model_is_mrope(self) -> bool: + """Detect if the model has "mrope" rope_scaling type. 
+ mrope requires keep "rope_deltas" between prompt and decoding phases.""" + rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) + if rope_scaling is None: + return False + return rope_scaling.get("type", None) == "mrope" + def load_model(self) -> None: import habana_frameworks.torch.core as htcore if self.model_config.quantization == 'inc' or \ @@ -935,6 +947,7 @@ def _prepare_prompt( ) -> PreparePromptMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] + input_mrope_positions: List[List[int]] = [[] for _ in range(3)] slot_mapping: List[List[int]] = [] lora_index_mapping: List[List[int]] = [] lora_prompt_mapping: List[List[int]] = [] @@ -1019,6 +1032,37 @@ def _prepare_prompt( seq_group_metadata.mm_processor_kwargs, ) + mrope_positions = None + if self.runner.model_is_mrope: + image_grid_thw = mm_kwargs.get("image_grid_thw", None) + video_grid_thw = mm_kwargs.get("video_grid_thw", None) + assert image_grid_thw is not None or video_grid_thw is not None, ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw'.") + + hf_config = self.runner.model_config.hf_config + token_ids = seq_data.get_token_ids() + mrope_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config. + spatial_merge_size, + context_len=computed_len, + ) + seq_data.mrope_position_delta = mrope_position_delta + if mrope_positions: + for idx in range(3): + input_mrope_positions[idx].extend(mrope_positions[idx]) + else: + input_positions.extend(list(range(computed_len, seq_len))) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): @@ -1058,6 +1102,11 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping[-1].append(slot) + if any(input_mrope_positions): + input_positions = None # type: ignore + else: + input_mrope_positions = None # type: ignore + max_query_len = max(query_lens) real_num_seqs = len(query_lens) @@ -1110,7 +1159,7 @@ def _prepare_prompt( dtype=torch.long, device='cpu') - input_positions = make_tensor_with_pad(input_positions, + input_positions = make_tensor_with_pad(input_positions or input_mrope_positions, max_len=max_prompt_len, pad=0, dtype=torch.long, @@ -1196,6 +1245,7 @@ def _prepare_decode( ) -> PrepareDecodeMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] + input_mrope_positions: List[List[int]] = [[] for _ in range(3)] slot_mapping: List[List[int]] = [] seq_lens: List[int] = [] encoder_seq_lens: List[int] = [] @@ -1241,7 +1291,17 @@ def _prepare_decode( seq_len = seq_data.get_len() position = seq_len - 1 - input_positions.append([position]) + if seq_data.mrope_position_delta is not None: + context_len = seq_data.get_num_computed_tokens() + next_pos = MRotaryEmbedding.get_next_input_positions( + seq_data.mrope_position_delta, + context_len, + seq_len, + ) + for idx in range(3): + input_mrope_positions[idx].extend(next_pos[idx]) + else: + input_positions.append(position) seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) @@ -1264,6 +1324,20 @@ def _prepare_decode( lora_index_mapping.append(lora_id) 
lora_prompt_mapping.append(lora_id) + #sasarkar this bit isnt there in the latest cpu code. maybe subsumed by: + ''' + input_positions = torch.tensor( + input_data.input_positions + if not any(input_data.input_mrope_positions) else + input_data.input_mrope_positions, + dtype=torch.long, + device="cpu") + ''' + if any(input_mrope_positions): + input_positions = None # type: ignore + else: + input_mrope_positions = None # type: ignore + if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window // self.block_size) @@ -1278,7 +1352,7 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() - input_positions = torch.tensor(input_positions, + input_positions = torch.tensor(input_positions or input_mrope_positions, dtype=torch.long, device='cpu') From a3f884b3b68cedd33b98260d60c90068e110694b Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 6 Feb 2025 09:03:36 +0000 Subject: [PATCH 02/34] Comments to trace execution diff between cpu/hpu --- vllm/worker/hpu_model_runner.py | 38 +++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index ed60f836ed51a..e04cf3c49143c 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1018,6 +1018,21 @@ def _prepare_prompt( # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) + ''' + seq_group_metadata.multi_modal_data is None, so we dont enter this + hence multi_modal_kwargs_list isnt populated + + on cpu its: + seq_group_metadata.multi_modal_data +{'pixel_values': tensor([[-1.1061, -1.1061, -1.1061, ..., -1.4518, -1.4518, -1.4518], + [-1.1207, -1.1207, -1.1207, ..., -1.4376, -1.4376, -1.4376], + [-1.1353, -1.1353, -1.1353, ..., -1.4376, -1.4376, -1.4376], + ..., + [ 1.1128, 0.9668, 0.8792, ..., 0.8945, 1.1221, 1.3496], + [ 0.9230, 1.2004, 1.3902, ..., 0.7950, 0.3542, 0.2973], + [ 0.9814, 0.9376, 1.0836, ..., 1.2643, 1.1789, 1.1363]]), 'image_grid_thw': tensor([[ 1, 62, 92]])} + + ''' if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ @@ -1586,11 +1601,10 @@ def prepare_input_tensors( decode_slot_mapping, decode_lora_ids, ) = self._prepare_decode(decode_reqs) - - if not self.is_pooler: - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.pin_memory) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + seq_lens, query_lens, + self.device, + self.pin_memory) if not self.scheduler_config.chunked_prefill_enabled: assert (len(prefill_reqs) and len(decode_reqs)) == 0 @@ -1774,12 +1788,18 @@ def create_dummy_seq_group_metadata(self, prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids - return SequenceGroupMetadata(request_id=str(group_id), + x = SequenceGroupMetadata(request_id=str(group_id), is_prompt=(output_len == 0), seq_data={group_id: seq_data}, sampling_params=sampling_params, block_tables=block_tables, lora_request=lora_request) + ''' + x.multi_modal_data is empty.... + we need to pass in some dummy here. + I think llama3.2VL is working, how is it working if this is empty?.. 
need to track llama3.2vl status + ''' + return x def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) @@ -1864,6 +1884,9 @@ def warmup_scenario(self, profiler.start() for _ in range(times): inputs = self.prepare_model_input(seqs) + ''' + sasarkar: at this point inputs.multi_modal_kwargs.keys() is empty.. thats not good + ''' is_single_step = \ self.vllm_config.scheduler_config.num_scheduler_steps == 1 if is_prompt or is_single_step: @@ -2328,6 +2351,9 @@ def prepare_model_input( seq_group_metadata_list=seq_group_metadata_list) model_input, sampling_metadata = self.prepare_input_tensors( seq_group_metadata_list) + ''' + sasarkar: model_input.multi_modal_kwargs empty here.. not good + ''' assert model_input.attn_metadata is not None is_prompt = model_input.attn_metadata.is_prompt From c83c882dfd6daf771c5dbcd5d40be09e6ffb77d0 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 7 Feb 2025 07:20:02 +0000 Subject: [PATCH 03/34] minor --- vllm/worker/hpu_model_runner.py | 51 ++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index e04cf3c49143c..4915fc45110c2 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -767,6 +767,7 @@ def model_is_mrope(self) -> bool: """Detect if the model has "mrope" rope_scaling type. mrope requires keep "rope_deltas" between prompt and decoding phases.""" rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) + breakpoint() if rope_scaling is None: return False return rope_scaling.get("type", None) == "mrope" @@ -965,6 +966,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() + #breakpoint() for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -1035,6 +1037,7 @@ def _prepare_prompt( ''' if seq_group_metadata.multi_modal_data: positions = input_positions[0] + #breakpoint() mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, range(positions[0], positions[0] + len(positions))) @@ -1048,14 +1051,16 @@ def _prepare_prompt( ) mrope_positions = None - if self.runner.model_is_mrope: + #breakpoint() + if self.model_config.uses_mrope: + #if self.model_is_mrope: # this returns false... rope_scaling.get("type", None) == "mrope" fails as it is "default" image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") - hf_config = self.runner.model_config.hf_config + hf_config = self.model_config.hf_config token_ids = seq_data.get_token_ids() mrope_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( @@ -1068,14 +1073,18 @@ def _prepare_prompt( vision_end_token_id=hf_config.vision_end_token_id, spatial_merge_size=hf_config.vision_config. spatial_merge_size, - context_len=computed_len, + context_len=context_len, ) seq_data.mrope_position_delta = mrope_position_delta + #breakpoint() + ''' + Hpu: mrope_positions 3x1024 .. 32 of the outer loop, so 1024*32 ... 
+ ''' if mrope_positions: for idx in range(3): input_mrope_positions[idx].extend(mrope_positions[idx]) else: - input_positions.extend(list(range(computed_len, seq_len))) + input_positions.extend(list(range(context_len, seq_len))) multi_modal_kwargs_list.append(mm_kwargs) @@ -1174,6 +1183,10 @@ def _prepare_prompt( dtype=torch.long, device='cpu') + #breakpoint() + #input_mrope_positions : list: 3x32768 + # max_prompt_len: max_prompt_len + # in CPU this is: torch.Size([3, 1451]) input_positions = make_tensor_with_pad(input_positions or input_mrope_positions, max_len=max_prompt_len, pad=0, @@ -1788,12 +1801,27 @@ def create_dummy_seq_group_metadata(self, prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids - x = SequenceGroupMetadata(request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=lora_request) + # sasarkar, unify if-else later + if self.model_config.uses_mrope: + # sasarkar: hard coded img shape. what should it be in general? + multi_modal_data_dummy = MultiModalKwargs({'pixel_values': torch.rand([5704, 1176]), 'image_grid_thw': torch.tensor([[ 1, 62, 92]])}) + x = SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request, + multi_modal_data=multi_modal_data_dummy, + mm_processor_kwargs={}, + multi_modal_placeholders={'image': [{'offset': 15, 'length': 1426}]}) # sasarkar.. remove hardcoded nums + else: + x = SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request) + #breakpoint() ''' x.multi_modal_data is empty.... we need to pass in some dummy here. @@ -1835,6 +1863,7 @@ def warmup_scenario(self, # passed in, which contains a lora from the lora warmup path. dummy_lora_requests: List[LoRARequest] = [] dummy_lora_requests_per_seq: List[LoRARequest] = [] + #breakpoint() if self.lora_config and is_lora_profile_run: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -1854,6 +1883,7 @@ def warmup_scenario(self, ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs or is_pt_profiler_run else 1 + #breakpoint() if is_prompt: seqs = [ self.create_dummy_seq_group_metadata( @@ -1887,6 +1917,7 @@ def warmup_scenario(self, ''' sasarkar: at this point inputs.multi_modal_kwargs.keys() is empty.. thats not good ''' + #breakpoint() is_single_step = \ self.vllm_config.scheduler_config.num_scheduler_steps == 1 if is_prompt or is_single_step: From c5f65f936dd1f4e5bba0f01a4e99322a2749c0d4 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 7 Feb 2025 08:41:04 +0000 Subject: [PATCH 04/34] _validate_and_reshape_mm_tensor looks buggy... 
bypassing it with alternative pt code else it was editing image_grid_thw to 0,0,0 etc --- vllm/model_executor/models/qwen2_vl.py | 30 ++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 961f53cef1379..a457fbeec6faf 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -595,6 +595,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() + breakpoint() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb @@ -1112,6 +1113,14 @@ def _validate_and_reshape_mm_tensor(self, mm_input: object, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") + # sasarkar buggy + ''' + (Pdb) (list(mm_input)) +[tensor([[ 1, 62, 92]], device='hpu:0')] +(Pdb) torch.concat(list(mm_input)) +tensor([[0, 0, 0]], device='hpu:0') + + ''' return torch.concat(list(mm_input)) else: return torch.concat(mm_input) @@ -1126,10 +1135,13 @@ def _parse_and_validate_image_input( return None if pixel_values is not None: - pixel_values = self._validate_and_reshape_mm_tensor( - pixel_values, "image pixel values") - image_grid_thw = self._validate_and_reshape_mm_tensor( - image_grid_thw, "image grid_thw") + # sasarkar: _validate_and_reshape_mm_tensor seems to be messing up the values some how + #pixel_values = self._validate_and_reshape_mm_tensor( + # pixel_values, "image pixel values") + #image_grid_thw = self._validate_and_reshape_mm_tensor( + # image_grid_thw, "image grid_thw") + pixel_values = pixel_values.view(-1, pixel_values.shape[-1]) + image_grid_thw = image_grid_thw.view(-1, image_grid_thw.shape[-1]) if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of image pixel values. " @@ -1140,6 +1152,7 @@ def _parse_and_validate_image_input( image_grid_thw=image_grid_thw) if image_embeds is not None: + assert False, "Call me if this is hit" image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") image_grid_thw = self._validate_and_reshape_mm_tensor( @@ -1357,6 +1370,15 @@ def forward( video_input=video_input) input_ids = None + ''' + During "warmup": ... 
have switched off warmup memory for now + hpu: + input_ids, positions: 32x1024 + + CPU: + input_ids is None + positions: 3x1451: + ''' hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, From fca160d28f59ddb7b4259860e54206c461479753 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 7 Feb 2025 08:57:19 +0000 Subject: [PATCH 05/34] Some comments regd buggy hpu graphs running if we use enforce_eager: llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct", enforce_eager=True) --- vllm/model_executor/models/qwen2_vl.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a457fbeec6faf..786ab2177e59d 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -580,7 +580,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, + h // self.spatial_merge_size, # buggy... 62/2 is yielding 0 .. seems its ok wo hpu graphs self.spatial_merge_size, w // self.spatial_merge_size, self.spatial_merge_size, @@ -595,7 +595,6 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() - breakpoint() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb @@ -1113,7 +1112,7 @@ def _validate_and_reshape_mm_tensor(self, mm_input: object, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - # sasarkar buggy + # sasarkar buggy ... seems an issue with hpu graph? 
''' (Pdb) (list(mm_input)) [tensor([[ 1, 62, 92]], device='hpu:0')] From 095dbbd916146186bbfacb84dfebc39b7161f3ae Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 7 Feb 2025 09:09:14 +0000 Subject: [PATCH 06/34] Return early to prevent mem profiling --- vllm/model_executor/layers/rotary_embedding.py | 2 +- vllm/worker/hpu_model_runner.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 49d6b0ac13134..9f0e770c40be1 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -849,7 +849,7 @@ def forward( dim=-1) query_shape = query.shape - breakpoint() + #breakpoint() ''' in CPU: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 4915fc45110c2..4dc4c0d3281a2 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1830,6 +1830,7 @@ def create_dummy_seq_group_metadata(self, return x def profile_run(self) -> None: + return num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers bind_kv_cache( From f557d9968e76ab5ea4eaec7013c79447f4702d27 Mon Sep 17 00:00:00 2001 From: pallavi jaini Date: Fri, 7 Feb 2025 19:16:26 +0000 Subject: [PATCH 07/34] Initial commit for the Qwen 2.5 VL --- vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/model_executor/models/registry.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 6aec99b3f9641..e1fd63ffeb4a7 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1121,4 +1121,4 @@ def get_mm_mapping(self) -> MultiModelKeys: return MultiModelKeys.from_string_field( language_model="language_model", connector="visual.", - tower_model="visual.merger.") + tower_model="visual.merger.") \ No newline at end of file diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7260d973bfb28..f04867070c479 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -135,6 +135,7 @@ "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # [Auto-converted (see adapters.py)] "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"), # Technically PrithviGeoSpatialMAE is a model that works on images, both in From 8c7a2b3d2947d6174f9597465239c7d2201b700d Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Sun, 9 Feb 2025 05:16:38 +0000 Subject: [PATCH 08/34] workaround to make HPU graphs work. disable_tensor_cache set to false. 
--- test_multimodal.py | 34 +++++++++++++++++++++++++++++++++ vllm/worker/hpu_model_runner.py | 4 +++- 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 test_multimodal.py diff --git a/test_multimodal.py b/test_multimodal.py new file mode 100644 index 0000000000000..f216e06d2ed2f --- /dev/null +++ b/test_multimodal.py @@ -0,0 +1,34 @@ +from vllm import LLM +from vllm import SamplingParams +from vllm.assets.image import ImageAsset +import PIL +import multiprocessing + +def main(): + # Load the image + image = ImageAsset("stop_sign").pil_image + + sampling_params = SamplingParams(temperature=0.8, top_p=0.95 ) + # Initialize the LLM with a multimodal model like LLaVA + # llava-hf/llava-1.5-7b-hf + # Qwen/Qwen2-VL-7B-Instruct + # meta-llama/Llama-3.2-11B-Vision-Instruct -> /root/sasarkar/clean_model_garden/models--meta-llama--Llama-3.2-11B-Vision-Instruct + llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct", enforce_eager=False) + #llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", enforce_eager=True) + #llm = LLM(model="llava-hf/llava-1.5-7b-hf") + #llm = LLM(model="/root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5/", tensor_parallel_size=2,) + # Create the prompt with image data + # llava prompt + #prompt = "USER: \nWhat is the content of this image?\nASSISTANT: <|image|>" + #prompt = "" * 576 + ("\nUSER: What is the content of this image?\nASSISTANT:") + # qwen2-vl prompt + prompt = '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n' + outputs = llm.generate({"prompt": prompt, "multi_modal_data": {"image": image}}) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +if __name__ == "__main__": + multiprocessing.freeze_support() + main() \ No newline at end of file diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 4dc4c0d3281a2..94e8e0c79047d 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -890,8 +890,10 @@ def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): return seq_group_metadata_list, real_batch_size, batch_size_padded def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): + import os + workaround = os.environ.get('WORKAROUND', '0') == '1' # there is also a flag provided for disabletensorcache return htorch.hpu.wrap_in_hpu_graph( - HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True + HpuModelAdapter(*args, **kwargs), disable_tensor_cache=not workaround # orig code its set to True ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) From 22bc3ef8900a0aff7d6906535911bd2d44e6a598 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 10 Feb 2025 19:25:29 +0000 Subject: [PATCH 09/34] adding qwen2.5-vl to hpu + small cleanups --- .../model_executor/layers/rotary_embedding.py | 20 ------------------ vllm/worker/hpu_model_runner.py | 21 ++++++------------- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 9f0e770c40be1..6ebab8927a92b 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -849,26 +849,6 @@ def forward( dim=-1) query_shape = query.shape - #breakpoint() - ''' - in CPU: - - query.shape - torch.Size([1451, 3584]) - - num_tokens - 1451 - (Pdb) 
self.head_size - 128 - - on HPU: - query.shape - torch.Size([32, 1024, 3584]) - (Pdb) num_tokens - 1024 - (Pdb) self.head_size - 128 - ''' query = query.view(num_tokens, -1, self.head_size) query_rot = query[..., :self.rotary_dim] query_pass = query[..., self.rotary_dim:] diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 94e8e0c79047d..d43c96671b406 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -58,6 +58,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, Logprob, SequenceData, SequenceGroupMetadata, SequenceOutput) +from vllm.transformers_utils.config import uses_mrope from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( @@ -764,13 +765,8 @@ def _set_gc_threshold(self) -> None: @property def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. - mrope requires keep "rope_deltas" between prompt and decoding phases.""" - rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) - breakpoint() - if rope_scaling is None: - return False - return rope_scaling.get("type", None) == "mrope" + config = self.model_config.hf_config + return uses_mrope(config) def load_model(self) -> None: import habana_frameworks.torch.core as htcore @@ -1053,11 +1049,10 @@ def _prepare_prompt( ) mrope_positions = None - #breakpoint() if self.model_config.uses_mrope: - #if self.model_is_mrope: # this returns false... rope_scaling.get("type", None) == "mrope" fails as it is "default" image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) assert image_grid_thw is not None or video_grid_thw is not None, ( "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") @@ -1067,14 +1062,10 @@ def _prepare_prompt( mrope_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. 
- spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, context_len=context_len, ) seq_data.mrope_position_delta = mrope_position_delta From d4a721c749c119f96463b03d7affb12b3776e0ec Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 10 Feb 2025 20:20:34 +0000 Subject: [PATCH 10/34] removing duplicates CPU --- vllm/model_executor/models/qwen2_5_vl.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e1fd63ffeb4a7..10d1ef69b82de 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -612,11 +612,18 @@ def forward( # windows attention window_index, cu_window_seqlens = self.get_window_index(grid_thw) + def remove_duplicates_cpu(a): + return [ + a[i] for i in range(len(a)) if i==0 or a[i-1]!= a[i] + ] + cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) cu_window_seqlens = torch.tensor( cu_window_seqlens, device=hidden_states.device, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + # This is not a static operation, removing duplicates earlier on CPU + #cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + seq_len, _ = hidden_states.size() hidden_states = hidden_states.reshape( seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) From 5474d9bb89909b6df05b3ad0ec05d8fe703fe2df Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 13 Feb 2025 19:01:15 +0000 Subject: [PATCH 11/34] small changes to work with llama-3.2-vl --- vllm/worker/hpu_model_runner.py | 103 ++++++-------------------------- 1 file changed, 18 insertions(+), 85 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d43c96671b406..cbd08675ade52 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -21,7 +21,6 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch -import torch.nn as nn import vllm_hpu_extension.environment as environment from vllm_hpu_extension.bucketing import HPUBucketingContext from vllm_hpu_extension.flags import enabled_flags @@ -893,7 +892,7 @@ def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) - def get_model(self) -> nn.Module: + def get_model(self) -> torch.nn.Module: if isinstance(self.model, HpuModelAdapter): return self.model.model return self.model @@ -964,7 +963,6 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() - #breakpoint() for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -1018,24 +1016,8 @@ def _prepare_prompt( # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) - ''' - seq_group_metadata.multi_modal_data is None, so we dont enter this - hence multi_modal_kwargs_list isnt populated - - on cpu its: - seq_group_metadata.multi_modal_data -{'pixel_values': tensor([[-1.1061, -1.1061, -1.1061, ..., -1.4518, -1.4518, -1.4518], - [-1.1207, -1.1207, -1.1207, ..., -1.4376, -1.4376, -1.4376], - [-1.1353, -1.1353, -1.1353, ..., -1.4376, -1.4376, -1.4376], - ..., - [ 1.1128, 0.9668, 0.8792, ..., 0.8945, 1.1221, 1.3496], - [ 0.9230, 1.2004, 1.3902, ..., 0.7950, 0.3542, 0.2973], - [ 0.9814, 0.9376, 1.0836, ..., 1.2643, 1.1789, 1.1363]]), 'image_grid_thw': tensor([[ 1, 62, 92]])} - - ''' if seq_group_metadata.multi_modal_data: positions = input_positions[0] - #breakpoint() mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, range(positions[0], positions[0] + len(positions))) @@ -1049,7 +1031,7 @@ def _prepare_prompt( ) mrope_positions = None - if self.model_config.uses_mrope: + if self.model_is_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) @@ -1069,15 +1051,9 @@ def _prepare_prompt( context_len=context_len, ) seq_data.mrope_position_delta = mrope_position_delta - #breakpoint() - ''' - Hpu: mrope_positions 3x1024 .. 32 of the outer loop, so 1024*32 ... - ''' - if mrope_positions: - for idx in range(3): - input_mrope_positions[idx].extend(mrope_positions[idx]) - else: - input_positions.extend(list(range(context_len, seq_len))) + if mrope_positions: + for idx in range(3): + input_mrope_positions[idx].extend(mrope_positions[idx]) multi_modal_kwargs_list.append(mm_kwargs) @@ -1176,10 +1152,6 @@ def _prepare_prompt( dtype=torch.long, device='cpu') - #breakpoint() - #input_mrope_positions : list: 3x32768 - # max_prompt_len: max_prompt_len - # in CPU this is: torch.Size([3, 1451]) input_positions = make_tensor_with_pad(input_positions or input_mrope_positions, max_len=max_prompt_len, pad=0, @@ -1322,7 +1294,7 @@ def _prepare_decode( for idx in range(3): input_mrope_positions[idx].extend(next_pos[idx]) else: - input_positions.append(position) + input_positions.append([position]) seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) @@ -1345,20 +1317,6 @@ def _prepare_decode( lora_index_mapping.append(lora_id) lora_prompt_mapping.append(lora_id) - #sasarkar this bit isnt there in the latest cpu code. 
maybe subsumed by: - ''' - input_positions = torch.tensor( - input_data.input_positions - if not any(input_data.input_mrope_positions) else - input_data.input_mrope_positions, - dtype=torch.long, - device="cpu") - ''' - if any(input_mrope_positions): - input_positions = None # type: ignore - else: - input_mrope_positions = None # type: ignore - if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window // self.block_size) @@ -1373,6 +1331,12 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() + + if any(input_mrope_positions): + input_positions = None # type: ignore + else: + input_mrope_positions = None # type: ignore + input_positions = torch.tensor(input_positions or input_mrope_positions, dtype=torch.long, device='cpu') @@ -1794,36 +1758,14 @@ def create_dummy_seq_group_metadata(self, prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids - # sasarkar, unify if-else later - if self.model_config.uses_mrope: - # sasarkar: hard coded img shape. what should it be in general? - multi_modal_data_dummy = MultiModalKwargs({'pixel_values': torch.rand([5704, 1176]), 'image_grid_thw': torch.tensor([[ 1, 62, 92]])}) - x = SequenceGroupMetadata(request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=lora_request, - multi_modal_data=multi_modal_data_dummy, - mm_processor_kwargs={}, - multi_modal_placeholders={'image': [{'offset': 15, 'length': 1426}]}) # sasarkar.. remove hardcoded nums - else: - x = SequenceGroupMetadata(request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=lora_request) - #breakpoint() - ''' - x.multi_modal_data is empty.... - we need to pass in some dummy here. - I think llama3.2VL is working, how is it working if this is empty?.. need to track llama3.2vl status - ''' - return x + return SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request) def profile_run(self) -> None: - return num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers bind_kv_cache( @@ -1857,7 +1799,6 @@ def warmup_scenario(self, # passed in, which contains a lora from the lora warmup path. dummy_lora_requests: List[LoRARequest] = [] dummy_lora_requests_per_seq: List[LoRARequest] = [] - #breakpoint() if self.lora_config and is_lora_profile_run: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -1877,7 +1818,6 @@ def warmup_scenario(self, ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs or is_pt_profiler_run else 1 - #breakpoint() if is_prompt: seqs = [ self.create_dummy_seq_group_metadata( @@ -1908,10 +1848,6 @@ def warmup_scenario(self, profiler.start() for _ in range(times): inputs = self.prepare_model_input(seqs) - ''' - sasarkar: at this point inputs.multi_modal_kwargs.keys() is empty.. 
thats not good - ''' - #breakpoint() is_single_step = \ self.vllm_config.scheduler_config.num_scheduler_steps == 1 if is_prompt or is_single_step: @@ -2376,9 +2312,6 @@ def prepare_model_input( seq_group_metadata_list=seq_group_metadata_list) model_input, sampling_metadata = self.prepare_input_tensors( seq_group_metadata_list) - ''' - sasarkar: model_input.multi_modal_kwargs empty here.. not good - ''' assert model_input.attn_metadata is not None is_prompt = model_input.attn_metadata.is_prompt From 008fbb53ad1b1b187e7d23eff7ebc58595b76716 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Sun, 16 Feb 2025 02:18:09 +0000 Subject: [PATCH 12/34] skip profile_run for now --- vllm/worker/hpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index cbd08675ade52..8c2a1b0804f3c 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1766,6 +1766,7 @@ def create_dummy_seq_group_metadata(self, lora_request=lora_request) def profile_run(self) -> None: + return num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers bind_kv_cache( From f48d6fc84f8a02d0d457e95c8c4285ed7a4d6d17 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Tue, 18 Feb 2025 20:18:38 +0000 Subject: [PATCH 13/34] reshape positions in MRotaryEmbedding for HPU --- vllm/model_executor/layers/rotary_embedding.py | 5 +++++ vllm/worker/hpu_model_runner.py | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 6ebab8927a92b..1468c8ec30182 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,6 +830,11 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 + if positions.ndim == 2 and positions.shape[0] != 3: + # HPU positions are [batch_size, num_tokens] + # if they are not [3, num_tokens], we will + # reshape it to be [num_tokens, ] + positions = positions.view(-1) num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8c2a1b0804f3c..cbd08675ade52 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1766,7 +1766,6 @@ def create_dummy_seq_group_metadata(self, lora_request=lora_request) def profile_run(self) -> None: - return num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers bind_kv_cache( From 4caf3834fde89da3843f720b16215b72108cb2e3 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Fri, 21 Feb 2025 09:31:58 +0000 Subject: [PATCH 14/34] input positions [3, seq_len] or [seq_len,] for Qwen2.5vl --- .../model_executor/layers/rotary_embedding.py | 7 +- vllm/worker/hpu_model_runner.py | 114 ++++++++++++------ 2 files changed, 78 insertions(+), 43 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 1468c8ec30182..d374d880efa09 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,12 +830,7 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 - if positions.ndim == 2 and positions.shape[0] != 3: - # HPU positions are [batch_size, num_tokens] - # if they are not [3, 
num_tokens], we will - # reshape it to be [num_tokens, ] - positions = positions.view(-1) - + # print(f"positions {positions.shape} query {query.shape} key {key.shape}") num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index cbd08675ade52..c0ba9243fc088 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -422,11 +422,9 @@ def _prepare_cos_sin(self, positions): current_module.prepare_cos_sin( positions, recompute_cos_sin=self.recompute_cos_sin) else: - pass - # dont raise error for qwen2.5-vl - #raise AttributeError( - # "The module at the end of the path does not have \ - # a 'prepare_cos_sin' method.") + raise AttributeError( + "The module at the end of the path does not have \ + a 'prepare_cos_sin' method.") def forward(self, *args, **kwargs): kwargs = kwargs.copy() @@ -440,9 +438,10 @@ def forward(self, *args, **kwargs): kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, self.dtype) - if 'lora_mask' in kwargs: - LoraMask.setLoraMask(kwargs.pop('lora_mask')) - if self.layer_names is not None: + LoraMask.setLoraMask(kwargs.pop('lora_mask')) + model_config = getattr(self.model, "config", None) + model_is_mrope = uses_mrope(model_config) + if self.layer_names is not None and not model_is_mrope: self._prepare_cos_sin(kwargs['positions']) with set_forward_context(kwargs['attn_metadata'], self.vllm_config, @@ -945,7 +944,7 @@ def _prepare_prompt( ) -> PreparePromptMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] - input_mrope_positions: List[List[int]] = [[] for _ in range(3)] + input_mrope_positions: List[List[int]] = [] slot_mapping: List[List[int]] = [] lora_index_mapping: List[List[int]] = [] lora_prompt_mapping: List[List[int]] = [] @@ -1016,6 +1015,7 @@ def _prepare_prompt( # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) + batch_input_mrope_positions = None if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ @@ -1030,7 +1030,6 @@ def _prepare_prompt( seq_group_metadata.mm_processor_kwargs, ) - mrope_positions = None if self.model_is_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) @@ -1050,11 +1049,11 @@ def _prepare_prompt( second_per_grid_ts=second_per_grid_ts, context_len=context_len, ) + assert mrope_positions is not None seq_data.mrope_position_delta = mrope_position_delta - if mrope_positions: - for idx in range(3): - input_mrope_positions[idx].extend(mrope_positions[idx]) - + batch_input_mrope_positions = [[] for _ in range(3)] + for idx in range(3): + batch_input_mrope_positions[idx].extend(mrope_positions[idx]) multi_modal_kwargs_list.append(mm_kwargs) @@ -1062,6 +1061,8 @@ def _prepare_prompt( multi_modal_placeholder_maps[modality].extend( placeholder_map) + input_mrope_positions.append(batch_input_mrope_positions) + if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized # yet. In this case, we just use a dummy slot mapping. 
@@ -1095,11 +1096,6 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping[-1].append(slot) - if any(input_mrope_positions): - input_positions = None # type: ignore - else: - input_mrope_positions = None # type: ignore - max_query_len = max(query_lens) real_num_seqs = len(query_lens) @@ -1152,11 +1148,40 @@ def _prepare_prompt( dtype=torch.long, device='cpu') - input_positions = make_tensor_with_pad(input_positions or input_mrope_positions, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu') + mrope_input_positions: Optional[List[List[int]]] = None + if any(mrope_position is not None + for mrope_position in input_mrope_positions): + assert self.model_is_mrope + mrope_input_positions = [[] for _ in range(3)] + for idx in range(3): + for b_idx, input_mrope_position in enumerate(input_mrope_positions): + if input_mrope_position is None: + positions = input_positions[b_idx] + else: + positions = input_mrope_position[idx] + # print(f"positions {len(positions)}") + padded_positions = make_tensor_with_pad([positions], + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu').flatten().tolist() + mrope_input_positions[idx].extend(padded_positions) + input_positions = None # type: ignore + input_positions_tensor = torch.tensor(mrope_input_positions, + dtype=torch.long, + device='cpu', + ) + else: + input_mrope_positions = None # type: ignore + input_positions_tensor = make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu') + if self.model_is_mrope: + # Qwen 2.5 vl works with flatten input_positions + input_positions_tensor = input_positions_tensor.flatten() + slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, @@ -1185,7 +1210,7 @@ def _prepare_prompt( self.device, non_blocking=True) input_tokens_tensor = input_tokens_tensor.to( # type: ignore self.device, non_blocking=True) - input_positions = input_positions.to( # type: ignore + input_positions_tensor = input_positions_tensor.to( # type: ignore self.device, non_blocking=True) slot_mapping = slot_mapping.to( # type: ignore self.device, non_blocking=True) @@ -1220,7 +1245,7 @@ def _prepare_prompt( self.device, non_blocking=True) return PreparePromptMetadata(input_tokens=input_tokens_tensor, - input_positions=input_positions, + input_positions=input_positions_tensor, attn_metadata=attn_metadata, seq_lens=seq_lens, query_lens=query_lens, @@ -1284,17 +1309,21 @@ def _prepare_decode( seq_len = seq_data.get_len() position = seq_len - 1 - if seq_data.mrope_position_delta is not None: - context_len = seq_data.get_num_computed_tokens() - next_pos = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - for idx in range(3): - input_mrope_positions[idx].extend(next_pos[idx]) - else: - input_positions.append([position]) + # FIXME: Why do we need to change the decode? 
+ # I didn't find a similar example on the GPU code + # only on the CPU + # + # if seq_data.mrope_position_delta is not None: + # context_len = seq_data.get_num_computed_tokens() + # next_pos = MRotaryEmbedding.get_next_input_positions( + # seq_data.mrope_position_delta, + # context_len, + # seq_len, + # ) + # for idx in range(3): + # input_mrope_positions[idx].extend(next_pos[idx]) + # else: + input_positions.append([position]) seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) @@ -1341,6 +1370,10 @@ def _prepare_decode( dtype=torch.long, device='cpu') + if self.model_is_mrope: + # Qwen 2.5 vl works with flatten input_positions + input_positions = input_positions.flatten() + num_decode_tokens = len(seq_lens) last_block_usage = [ @@ -1745,6 +1778,13 @@ def create_dummy_seq_group_metadata(self, sampling_params = SamplingParams(temperature=temperature) num_blocks = math.ceil(seq_len / self.block_size) seq_len = max(seq_len, 1) + # TODO: Add dummy data with metadata info + # encoder_dummy_data \ + # = self.input_registry.dummy_data_for_profiling( + # self.model_config, + # seq_len, + # self.mm_registry, + # is_encoder_data=True) if is_prompt: input_len = seq_len output_len = 0 From 998d0902ff3ede5cd18d580874174d03b45f759a Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 24 Feb 2025 18:55:01 +0000 Subject: [PATCH 15/34] fix the decoder --- .../model_executor/layers/rotary_embedding.py | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/worker/hpu_model_runner.py | 42 +++++++++---------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d374d880efa09..2d0d7c22b9fc7 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,7 +830,7 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 - # print(f"positions {positions.shape} query {query.shape} key {key.shape}") + print(f" rotary_emd positions {positions.shape} query {query.shape} key {key.shape}") num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 10d1ef69b82de..669b7e759fa59 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1066,7 +1066,7 @@ def forward( in seconds) for each grid along the temporal dimension in the 3D position IDs. `None` if no videos are passed. """ - + print(f"> qwen2_5_vl.py: input_ids {input_ids.shape} positions {positions.shape}") if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c0ba9243fc088..555b01ad2befe 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1011,6 +1011,7 @@ def _prepare_prompt( context_lens.append(context_len) query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) + print("tokens", prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) @@ -1159,7 +1160,6 @@ def _prepare_prompt( positions = input_positions[b_idx] else: positions = input_mrope_position[idx] - # print(f"positions {len(positions)}") padded_positions = make_tensor_with_pad([positions], max_len=max_prompt_len, pad=0, @@ -1171,6 +1171,7 @@ def _prepare_prompt( dtype=torch.long, device='cpu', ) + print(f" ABC: input positions with MROPE shape is {input_positions_tensor.shape}") else: input_mrope_positions = None # type: ignore input_positions_tensor = make_tensor_with_pad(input_positions, @@ -1181,6 +1182,7 @@ def _prepare_prompt( if self.model_is_mrope: # Qwen 2.5 vl works with flatten input_positions input_positions_tensor = input_positions_tensor.flatten() + print(f" ABC: input positions no mrope shape is {input_positions_tensor.shape}") slot_mapping = make_tensor_with_pad(slot_mapping, @@ -1309,22 +1311,22 @@ def _prepare_decode( seq_len = seq_data.get_len() position = seq_len - 1 - # FIXME: Why do we need to change the decode? - # I didn't find a similar example on the GPU code - # only on the CPU - # - # if seq_data.mrope_position_delta is not None: - # context_len = seq_data.get_num_computed_tokens() - # next_pos = MRotaryEmbedding.get_next_input_positions( - # seq_data.mrope_position_delta, - # context_len, - # seq_len, - # ) - # for idx in range(3): - # input_mrope_positions[idx].extend(next_pos[idx]) - # else: + input_positions.append([position]) + if seq_data.mrope_position_delta is not None: + context_len = seq_data.get_num_computed_tokens() + pos_for_mrope = MRotaryEmbedding.get_next_input_positions( + seq_data.mrope_position_delta, + context_len, + seq_len, + ) + else: + pos_for_mrope = [[position]] * 3 + + for idx in range(3): + input_mrope_positions[idx].extend(pos_for_mrope[idx]) + seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) seq_lens.append(seq_len) @@ -1361,19 +1363,15 @@ def _prepare_decode( input_tokens = output[:real_batch_size].clone() - if any(input_mrope_positions): - input_positions = None # type: ignore + if self.model_is_mrope: + input_positions = None else: - input_mrope_positions = None # type: ignore + input_mrope_positions = None input_positions = torch.tensor(input_positions or input_mrope_positions, dtype=torch.long, device='cpu') - if self.model_is_mrope: - # Qwen 2.5 vl works with flatten input_positions - input_positions = input_positions.flatten() - num_decode_tokens = len(seq_lens) last_block_usage = [ From cd1bbe090474569fbf4e2f17da878296637178a8 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 24 Feb 2025 18:56:52 +0000 Subject: [PATCH 16/34] comment prints --- vllm/model_executor/layers/rotary_embedding.py | 2 +- vllm/worker/hpu_model_runner.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 2d0d7c22b9fc7..ccecdcfbb1a39 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,7 +830,7 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 - print(f" rotary_emd positions {positions.shape} query {query.shape} key {key.shape}") + # print(f" rotary_emd positions {positions.shape} query {query.shape} key {key.shape}") num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/worker/hpu_model_runner.py 
b/vllm/worker/hpu_model_runner.py index 555b01ad2befe..27924da5ecec1 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1011,7 +1011,7 @@ def _prepare_prompt( context_lens.append(context_len) query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) - print("tokens", prompt_tokens) + # print("tokens", prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) @@ -1171,7 +1171,7 @@ def _prepare_prompt( dtype=torch.long, device='cpu', ) - print(f" ABC: input positions with MROPE shape is {input_positions_tensor.shape}") + # print(f" ABC: input positions with MROPE shape is {input_positions_tensor.shape}") else: input_mrope_positions = None # type: ignore input_positions_tensor = make_tensor_with_pad(input_positions, @@ -1182,7 +1182,7 @@ def _prepare_prompt( if self.model_is_mrope: # Qwen 2.5 vl works with flatten input_positions input_positions_tensor = input_positions_tensor.flatten() - print(f" ABC: input positions no mrope shape is {input_positions_tensor.shape}") + # print(f" ABC: input positions no mrope shape is {input_positions_tensor.shape}") slot_mapping = make_tensor_with_pad(slot_mapping, From 99f8e9f4cf2213f3fadf136b30e5ab768f9af2e1 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 06:46:14 +0000 Subject: [PATCH 17/34] cleanup --- test_multimodal.py | 34 ------------------- .../model_executor/layers/rotary_embedding.py | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 3 +- vllm/model_executor/models/qwen2_vl.py | 31 +++-------------- vllm/model_executor/models/registry.py | 1 - 5 files changed, 7 insertions(+), 64 deletions(-) delete mode 100644 test_multimodal.py diff --git a/test_multimodal.py b/test_multimodal.py deleted file mode 100644 index f216e06d2ed2f..0000000000000 --- a/test_multimodal.py +++ /dev/null @@ -1,34 +0,0 @@ -from vllm import LLM -from vllm import SamplingParams -from vllm.assets.image import ImageAsset -import PIL -import multiprocessing - -def main(): - # Load the image - image = ImageAsset("stop_sign").pil_image - - sampling_params = SamplingParams(temperature=0.8, top_p=0.95 ) - # Initialize the LLM with a multimodal model like LLaVA - # llava-hf/llava-1.5-7b-hf - # Qwen/Qwen2-VL-7B-Instruct - # meta-llama/Llama-3.2-11B-Vision-Instruct -> /root/sasarkar/clean_model_garden/models--meta-llama--Llama-3.2-11B-Vision-Instruct - llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct", enforce_eager=False) - #llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", enforce_eager=True) - #llm = LLM(model="llava-hf/llava-1.5-7b-hf") - #llm = LLM(model="/root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5/", tensor_parallel_size=2,) - # Create the prompt with image data - # llava prompt - #prompt = "USER: \nWhat is the content of this image?\nASSISTANT: <|image|>" - #prompt = "" * 576 + ("\nUSER: What is the content of this image?\nASSISTANT:") - # qwen2-vl prompt - prompt = '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n' - outputs = llm.generate({"prompt": prompt, "multi_modal_data": {"image": image}}) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -if __name__ == "__main__": - multiprocessing.freeze_support() - main() \ No newline at end of file 
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index ccecdcfbb1a39..6ebab8927a92b 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,7 +830,7 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 - # print(f" rotary_emd positions {positions.shape} query {query.shape} key {key.shape}") + num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 669b7e759fa59..714f2a73a20ff 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1066,7 +1066,6 @@ def forward( in seconds) for each grid along the temporal dimension in the 3D position IDs. `None` if no videos are passed. """ - print(f"> qwen2_5_vl.py: input_ids {input_ids.shape} positions {positions.shape}") if intermediate_tensors is not None: inputs_embeds = None @@ -1128,4 +1127,4 @@ def get_mm_mapping(self) -> MultiModelKeys: return MultiModelKeys.from_string_field( language_model="language_model", connector="visual.", - tower_model="visual.merger.") \ No newline at end of file + tower_model="visual.merger.") diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 786ab2177e59d..961f53cef1379 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -580,7 +580,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, # buggy... 62/2 is yielding 0 .. seems its ok wo hpu graphs + h // self.spatial_merge_size, self.spatial_merge_size, w // self.spatial_merge_size, self.spatial_merge_size, @@ -1112,14 +1112,6 @@ def _validate_and_reshape_mm_tensor(self, mm_input: object, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - # sasarkar buggy ... seems an issue with hpu graph? - ''' - (Pdb) (list(mm_input)) -[tensor([[ 1, 62, 92]], device='hpu:0')] -(Pdb) torch.concat(list(mm_input)) -tensor([[0, 0, 0]], device='hpu:0') - - ''' return torch.concat(list(mm_input)) else: return torch.concat(mm_input) @@ -1134,13 +1126,10 @@ def _parse_and_validate_image_input( return None if pixel_values is not None: - # sasarkar: _validate_and_reshape_mm_tensor seems to be messing up the values some how - #pixel_values = self._validate_and_reshape_mm_tensor( - # pixel_values, "image pixel values") - #image_grid_thw = self._validate_and_reshape_mm_tensor( - # image_grid_thw, "image grid_thw") - pixel_values = pixel_values.view(-1, pixel_values.shape[-1]) - image_grid_thw = image_grid_thw.view(-1, image_grid_thw.shape[-1]) + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of image pixel values. 
" @@ -1151,7 +1140,6 @@ def _parse_and_validate_image_input( image_grid_thw=image_grid_thw) if image_embeds is not None: - assert False, "Call me if this is hit" image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") image_grid_thw = self._validate_and_reshape_mm_tensor( @@ -1369,15 +1357,6 @@ def forward( video_input=video_input) input_ids = None - ''' - During "warmup": ... have switched off warmup memory for now - hpu: - input_ids, positions: 32x1024 - - CPU: - input_ids is None - positions: 3x1451: - ''' hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f04867070c479..7260d973bfb28 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -135,7 +135,6 @@ "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 - "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # [Auto-converted (see adapters.py)] "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"), # Technically PrithviGeoSpatialMAE is a model that works on images, both in From 9eac068909b06df739d28df98b34e98ee85caf8c Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 17:00:30 +0000 Subject: [PATCH 18/34] polishing --- vllm/model_executor/models/qwen2_5_vl.py | 1 + vllm/worker/hpu_model_runner.py | 161 ++++++++++++----------- 2 files changed, 85 insertions(+), 77 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 714f2a73a20ff..2c7f0807c828b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1066,6 +1066,7 @@ def forward( in seconds) for each grid along the temporal dimension in the 3D position IDs. `None` if no videos are passed. 
""" + if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 27924da5ecec1..8273933c3926f 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -169,6 +169,9 @@ def get_target_layer_suffix_list(model_type) -> list[str]: decoder_layer_table.get(model_type, "DecoderLayer"), "EncoderLayer" ] +def get_hpu_disable_tensor_cache(): + env_var = os.environ.get('HPU_DISABLE_TENSOR_CACHE', 'true') + return env_var.lower() == 'true' def modify_model_layers(module: torch.nn.Module, suffix_list: list[str], @@ -227,6 +230,27 @@ def find_rope_layer(parent, path): # Return the result if found, otherwise None return path_to_rope +def build_and_pad_mrope_positions(input_positions: List[List[int]], + input_mrope_positions: List[List[List[int]]], + max_prompt_len) -> Optional[List[List[int]]]: + # Qwen2.5vl expects 3 lists of positions, we are going to pad each + # seq_data in the list using either MRope values for multi-modal + # or regular position for text only inputs + mrope_input_positions = [[] for _ in range(3)] + for idx in range(3): + for b_idx, input_mrope_position in enumerate(input_mrope_positions): + if input_mrope_position is not None: + positions = input_mrope_position[idx] + else: + # use regular positions as default + positions = input_positions[b_idx] + padded_positions = make_tensor_with_pad([positions], + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu').flatten().tolist() + mrope_input_positions[idx].extend(padded_positions) + return mrope_input_positions class HpuModelAdapter: @@ -438,7 +462,8 @@ def forward(self, *args, **kwargs): kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, self.dtype) - LoraMask.setLoraMask(kwargs.pop('lora_mask')) + if 'lora_mask' in kwargs: + LoraMask.setLoraMask(kwargs.pop('lora_mask')) model_config = getattr(self.model, "config", None) model_is_mrope = uses_mrope(model_config) if self.layer_names is not None and not model_is_mrope: @@ -884,10 +909,10 @@ def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): return seq_group_metadata_list, real_batch_size, batch_size_padded def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): - import os - workaround = os.environ.get('WORKAROUND', '0') == '1' # there is also a flag provided for disabletensorcache + disable_tensor_cache = get_hpu_disable_tensor_cache() return htorch.hpu.wrap_in_hpu_graph( - HpuModelAdapter(*args, **kwargs), disable_tensor_cache=not workaround # orig code its set to True + HpuModelAdapter(*args, **kwargs), + disable_tensor_cache=disable_tensor_cache, ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) @@ -938,13 +963,34 @@ def _check_config(self, batch_size, seq_len, attn_metadata, warmup_mode): "Configuration: (%s, %s, %s, %s) was not warmed-up!", phase, batch_size, seq_len, num_blocks) + def _get_mrope_positions_and_delta(self, seq_data, mm_kwargs, context_len): + image_grid_thw = mm_kwargs.get("image_grid_thw", None) + video_grid_thw = mm_kwargs.get("video_grid_thw", None) + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) + assert image_grid_thw is not None or video_grid_thw is not None, ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw'.") + hf_config = self.model_config.hf_config + token_ids = seq_data.get_token_ids() + mrope_positions, mrope_position_delta = \ + 
MRotaryEmbedding.get_input_positions( + token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=context_len, + ) + assert mrope_positions is not None + return mrope_positions, mrope_position_delta + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], ) -> PreparePromptMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] - input_mrope_positions: List[List[int]] = [] + input_mrope_positions: List[List[List[int]]] = [] slot_mapping: List[List[int]] = [] lora_index_mapping: List[List[int]] = [] lora_prompt_mapping: List[List[int]] = [] @@ -1011,12 +1057,11 @@ def _prepare_prompt( context_lens.append(context_len) query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) - # print("tokens", prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) - batch_input_mrope_positions = None + seq_data_mrope_positions = None if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ @@ -1031,30 +1076,19 @@ def _prepare_prompt( seq_group_metadata.mm_processor_kwargs, ) + # special processing for mrope position deltas. if self.model_is_mrope: - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - assert image_grid_thw is not None or video_grid_thw is not None, ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw'.") - - hf_config = self.model_config.hf_config - token_ids = seq_data.get_token_ids() mrope_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=context_len, - ) + self._get_mrope_positions_and_delta( + seq_data=seq_data, + mm_kwargs=mm_kwargs, + context_len=context_len) assert mrope_positions is not None seq_data.mrope_position_delta = mrope_position_delta - batch_input_mrope_positions = [[] for _ in range(3)] + seq_data_mrope_positions = [[] for _ in range(3)] for idx in range(3): - batch_input_mrope_positions[idx].extend(mrope_positions[idx]) + seq_data_mrope_positions[idx] \ + .extend(mrope_positions[idx]) multi_modal_kwargs_list.append(mm_kwargs) @@ -1062,7 +1096,7 @@ def _prepare_prompt( multi_modal_placeholder_maps[modality].extend( placeholder_map) - input_mrope_positions.append(batch_input_mrope_positions) + input_mrope_positions.append(seq_data_mrope_positions) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -1149,29 +1183,15 @@ def _prepare_prompt( dtype=torch.long, device='cpu') - mrope_input_positions: Optional[List[List[int]]] = None - if any(mrope_position is not None - for mrope_position in input_mrope_positions): - assert self.model_is_mrope - mrope_input_positions = [[] for _ in range(3)] - for idx in range(3): - for b_idx, input_mrope_position in enumerate(input_mrope_positions): - if input_mrope_position is None: - positions = input_positions[b_idx] - else: - positions = input_mrope_position[idx] - padded_positions = make_tensor_with_pad([positions], - max_len=max_prompt_len, - pad=0, - 
dtype=torch.long, - device='cpu').flatten().tolist() - mrope_input_positions[idx].extend(padded_positions) + if self.model_is_mrope: + padded_input_mrope_positions = \ + build_and_pad_mrope_positions(input_positions=input_positions, + input_mrope_positions=input_mrope_positions, + max_prompt_len=max_prompt_len) input_positions = None # type: ignore - input_positions_tensor = torch.tensor(mrope_input_positions, + input_positions_tensor = torch.tensor(padded_input_mrope_positions, dtype=torch.long, - device='cpu', - ) - # print(f" ABC: input positions with MROPE shape is {input_positions_tensor.shape}") + device='cpu') else: input_mrope_positions = None # type: ignore input_positions_tensor = make_tensor_with_pad(input_positions, @@ -1179,11 +1199,6 @@ def _prepare_prompt( pad=0, dtype=torch.long, device='cpu') - if self.model_is_mrope: - # Qwen 2.5 vl works with flatten input_positions - input_positions_tensor = input_positions_tensor.flatten() - # print(f" ABC: input positions no mrope shape is {input_positions_tensor.shape}") - slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, @@ -1311,21 +1326,19 @@ def _prepare_decode( seq_len = seq_data.get_len() position = seq_len - 1 - input_positions.append([position]) - if seq_data.mrope_position_delta is not None: - context_len = seq_data.get_num_computed_tokens() - pos_for_mrope = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - else: - pos_for_mrope = [[position]] * 3 - - for idx in range(3): - input_mrope_positions[idx].extend(pos_for_mrope[idx]) + if self.model_is_mrope: + if seq_data.mrope_position_delta is not None: + pos_for_mrope = MRotaryEmbedding \ + .get_next_input_positions( + seq_data.mrope_position_delta, + seq_data.get_num_computed_tokens(), + seq_len) + else: + pos_for_mrope = [[position]] * 3 + for idx in range(3): + input_mrope_positions[idx].extend(pos_for_mrope[idx]) seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) @@ -1602,10 +1615,11 @@ def prepare_input_tensors( decode_slot_mapping, decode_lora_ids, ) = self._prepare_decode(decode_reqs) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - seq_lens, query_lens, - self.device, - self.pin_memory) + + if not self.is_pooler: + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, seq_lens, query_lens, self.device, + self.pin_memory) if not self.scheduler_config.chunked_prefill_enabled: assert (len(prefill_reqs) and len(decode_reqs)) == 0 @@ -1776,13 +1790,6 @@ def create_dummy_seq_group_metadata(self, sampling_params = SamplingParams(temperature=temperature) num_blocks = math.ceil(seq_len / self.block_size) seq_len = max(seq_len, 1) - # TODO: Add dummy data with metadata info - # encoder_dummy_data \ - # = self.input_registry.dummy_data_for_profiling( - # self.model_config, - # seq_len, - # self.mm_registry, - # is_encoder_data=True) if is_prompt: input_len = seq_len output_len = 0 From dcc2c6c12c92d2a638c91bf46563a99ef3b43b35 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 18:59:49 +0000 Subject: [PATCH 19/34] add type ignore --- vllm/worker/hpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8273933c3926f..8485a51dfc3e9 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -236,7 +236,7 @@ def build_and_pad_mrope_positions(input_positions: 
List[List[int]], # Qwen2.5vl expects 3 lists of positions, we are going to pad each # seq_data in the list using either MRope values for multi-modal # or regular position for text only inputs - mrope_input_positions = [[] for _ in range(3)] + mrope_input_positions: List[List[int]] = [[] for _ in range(3)] for idx in range(3): for b_idx, input_mrope_position in enumerate(input_mrope_positions): if input_mrope_position is not None: @@ -1377,9 +1377,9 @@ def _prepare_decode( if self.model_is_mrope: - input_positions = None + input_positions = None # type: ignore else: - input_mrope_positions = None + input_mrope_positions = None # type: ignore input_positions = torch.tensor(input_positions or input_mrope_positions, dtype=torch.long, From 7c5871b1a4f2dc2ebe9b42edd48df5fca05505ec Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 22:59:32 +0000 Subject: [PATCH 20/34] set HPU_DISABLE_TENSOR_CACHE to false for Qwen2.5vl --- vllm/worker/hpu_model_runner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8485a51dfc3e9..c4a557cfb0a81 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -910,6 +910,11 @@ def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): disable_tensor_cache = get_hpu_disable_tensor_cache() + if self.model_is_mrope: + logger.warning( + "Setting HPU_DISABLE_TENSOR_CACHE to False for this model" + ) + disable_tensor_cache = False return htorch.hpu.wrap_in_hpu_graph( HpuModelAdapter(*args, **kwargs), disable_tensor_cache=disable_tensor_cache, From fc9e7eeaaa43fc82d9beaf8a08495f36c0724f93 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 23:50:00 +0000 Subject: [PATCH 21/34] make lint happy? --- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c4a557cfb0a81..d7bebb007dd18 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1066,7 +1066,7 @@ def _prepare_prompt( # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) - seq_data_mrope_positions = None + seq_data_mrope_positions : Optional[List[List[int]]] = None if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ From 67b696eb44af9102e4451f10781d7e964f448e31 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Wed, 26 Feb 2025 11:48:32 -0800 Subject: [PATCH 22/34] Change torch dtype to bflat16 for qwen2.5-VL test --- tests/models/decoder_only/vision_language/test_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 2c66edb539dce..48159a33da32f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -156,6 +156,7 @@ vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], + dtype=("bfloat16" if current_platform.is_hpu() else "half") ), #### Extended model tests "aria": VLMTestInfo( From cf97bed89d0944ccc2b736a9a3a1803224168ff3 Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Thu, 27 Feb 2025 00:41:13 +0000 Subject: [PATCH 23/34] fea(): Added the tests requirements Co-authored-by: Mohit Deopujari mohit.deopujari@intel.com Co-authored-by: Jimin Ha jimin.ha@intel.com Co-authored-by: Pallavi Jaini pallavi.jaini@intel.com Co-authored-by: Deepak Narayana deepak.narayana@intel.com Co-authored-by: Sayantan Sarkar sayantan.sarkar@intel.com Co-authored-by: Gustavo Malkomes gustavo.malkomes@intel.com --- requirements-hpu-qwen2_5_vl.txt | 1 + tests/conftest.py | 5 ++++- tests/models/decoder_only/vision_language/vlm_utils/core.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 requirements-hpu-qwen2_5_vl.txt diff --git a/requirements-hpu-qwen2_5_vl.txt b/requirements-hpu-qwen2_5_vl.txt new file mode 100644 index 0000000000000..21bcfbfe0b11c --- /dev/null +++ b/requirements-hpu-qwen2_5_vl.txt @@ -0,0 +1 @@ +transformers @ git+https://github.com/huggingface/transformers.git@6b550462139655d488d4c663086a63e98713c6b9 diff --git a/tests/conftest.py b/tests/conftest.py index 7fa6a35317ff7..060034d878ae0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -258,7 +258,10 @@ def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: return x if device is None: - device = "cpu" if current_platform.is_cpu() else "cuda" + if current_platform.is_hpu(): + device = "hpu" + else: + device = "cpu" if current_platform.is_cpu() else "cuda" if isinstance(x, dict): return {k: self.wrap_device(v, device) for k, v in x.items()} diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index f2260f56737d9..d688728ce8664 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -51,7 +51,7 @@ def run_test( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_info.check_transformers_version(on_fail="skip") + #model_info.check_transformers_version(on_fail="skip") vllm_outputs_per_mm = [] hf_outputs_per_mm = [] From c986f8da1cac5ff617f01dcf5b2f6876f77ac735 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 03:57:57 +0000 Subject: [PATCH 24/34] add check_transformers to 
qwen2_5_VL --- tests/conftest.py | 4 +++- tests/models/decoder_only/vision_language/vlm_utils/core.py | 2 +- tests/models/registry.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 060034d878ae0..6700b7ca6d08d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -260,8 +260,10 @@ def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: if device is None: if current_platform.is_hpu(): device = "hpu" + elif current_platform.is_cpu(): + device = "cpu" else: - device = "cpu" if current_platform.is_cpu() else "cuda" + device = "cuda" if isinstance(x, dict): return {k: self.wrap_device(v, device) for k, v in x.items()} diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index d688728ce8664..f2260f56737d9 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -51,7 +51,7 @@ def run_test( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - #model_info.check_transformers_version(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") vllm_outputs_per_mm = [] hf_outputs_per_mm = [] diff --git a/tests/models/registry.py b/tests/models/registry.py index c3e1c7859799c..cda981f86b7ad 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -278,7 +278,7 @@ def check_available_online( "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 - min_transformers_version="4.49"), # noqa: E501 + min_transformers_version="4.48.9"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", trust_remote_code=True), # [Encoder-decoder] From 08b35bffc7f8cb8e32c6ddff4569550234dba6a6 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 06:04:06 +0000 Subject: [PATCH 25/34] improving code and comments --- vllm/model_executor/models/qwen2_5_vl.py | 3 +- vllm/worker/hpu_model_runner.py | 44 ++++++++++++++---------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 2c7f0807c828b..5049f33d19cce 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -621,7 +621,8 @@ def remove_duplicates_cpu(a): cu_window_seqlens, device=hidden_states.device, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - # This is not a static operation, removing duplicates earlier on CPU + # NOTE: unique_consecutive is a dynamic operation + # we are replacing it with the `remove_duplicates_cpu` above #cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) seq_len, _ = hidden_states.size() diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d7bebb007dd18..24bb3b7f11658 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -230,27 +230,35 @@ def find_rope_layer(parent, path): # Return the result if found, otherwise None return path_to_rope -def build_and_pad_mrope_positions(input_positions: List[List[int]], +def make_mrope_positions_tensor_with_pad(input_positions: List[List[int]], input_mrope_positions: 
List[List[List[int]]], - max_prompt_len) -> Optional[List[List[int]]]: - # Qwen2.5vl expects 3 lists of positions, we are going to pad each - # seq_data in the list using either MRope values for multi-modal - # or regular position for text only inputs + max_prompt_len: int, + pad: int) -> Optional[List[List[int]]]: + # If no mrope positions, returns a flatten (seq_len,) + if all(mrope_position is None for mrope_position in input_mrope_positions): + return make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu').flatten() + # Otherwise, Qwen2.5-VL expects positions in a (3, seq_len) + # we are going to pad each seq_data in the list + # using either MRope values or regular position mrope_input_positions: List[List[int]] = [[] for _ in range(3)] for idx in range(3): for b_idx, input_mrope_position in enumerate(input_mrope_positions): if input_mrope_position is not None: positions = input_mrope_position[idx] else: - # use regular positions as default - positions = input_positions[b_idx] - padded_positions = make_tensor_with_pad([positions], - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu').flatten().tolist() + positions = input_positions[b_idx] + padding_size = max_prompt_len - len(positions) + assert padding_size >= 0 + padded_positions = positions \ + + (max_prompt_len - len(positions)) * [pad] mrope_input_positions[idx].extend(padded_positions) - return mrope_input_positions + return torch.tensor(mrope_input_positions, + dtype=torch.long, + device='cpu') class HpuModelAdapter: @@ -1189,14 +1197,12 @@ def _prepare_prompt( device='cpu') if self.model_is_mrope: - padded_input_mrope_positions = \ - build_and_pad_mrope_positions(input_positions=input_positions, + input_positions_tensor = \ + make_mrope_positions_tensor_with_pad(input_positions=input_positions, input_mrope_positions=input_mrope_positions, - max_prompt_len=max_prompt_len) + max_prompt_len=max_prompt_len, + pad=0) input_positions = None # type: ignore - input_positions_tensor = torch.tensor(padded_input_mrope_positions, - dtype=torch.long, - device='cpu') else: input_mrope_positions = None # type: ignore input_positions_tensor = make_tensor_with_pad(input_positions, From 75eb21bba2536494a708a6b5fd121cb708966d7c Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 22:50:09 +0000 Subject: [PATCH 26/34] lint --- vllm/worker/hpu_model_runner.py | 43 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 24bb3b7f11658..aa126de71c126 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -42,8 +42,8 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -169,10 +169,12 @@ def get_target_layer_suffix_list(model_type) -> list[str]: decoder_layer_table.get(model_type, "DecoderLayer"), "EncoderLayer" ] + def get_hpu_disable_tensor_cache(): env_var = os.environ.get('HPU_DISABLE_TENSOR_CACHE', 'true') return env_var.lower() == 'true' + def 
modify_model_layers(module: torch.nn.Module, suffix_list: list[str], n=1, @@ -230,10 +232,11 @@ def find_rope_layer(parent, path): # Return the result if found, otherwise None return path_to_rope -def make_mrope_positions_tensor_with_pad(input_positions: List[List[int]], - input_mrope_positions: List[List[List[int]]], - max_prompt_len: int, - pad: int) -> Optional[List[List[int]]]: + +def make_mrope_positions_tensor_with_pad( + input_positions: List[List[int]], + input_mrope_positions: List[List[List[int]]], max_prompt_len: int, + pad: int) -> Optional[List[List[int]]]: # If no mrope positions, returns a flatten (seq_len,) if all(mrope_position is None for mrope_position in input_mrope_positions): return make_tensor_with_pad(input_positions, @@ -256,9 +259,8 @@ def make_mrope_positions_tensor_with_pad(input_positions: List[List[int]], padded_positions = positions \ + (max_prompt_len - len(positions)) * [pad] mrope_input_positions[idx].extend(padded_positions) - return torch.tensor(mrope_input_positions, - dtype=torch.long, - device='cpu') + return torch.tensor(mrope_input_positions, dtype=torch.long, device='cpu') + class HpuModelAdapter: @@ -455,7 +457,7 @@ def _prepare_cos_sin(self, positions): positions, recompute_cos_sin=self.recompute_cos_sin) else: raise AttributeError( - "The module at the end of the path does not have \ + "The module at the end of the path does not have \ a 'prepare_cos_sin' method.") def forward(self, *args, **kwargs): @@ -920,8 +922,7 @@ def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): disable_tensor_cache = get_hpu_disable_tensor_cache() if self.model_is_mrope: logger.warning( - "Setting HPU_DISABLE_TENSOR_CACHE to False for this model" - ) + "Setting HPU_DISABLE_TENSOR_CACHE to False for this model") disable_tensor_cache = False return htorch.hpu.wrap_in_hpu_graph( HpuModelAdapter(*args, **kwargs), @@ -1074,7 +1075,7 @@ def _prepare_prompt( # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) - seq_data_mrope_positions : Optional[List[List[int]]] = None + seq_data_mrope_positions: Optional[List[List[int]]] = None if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ @@ -1109,7 +1110,8 @@ def _prepare_prompt( multi_modal_placeholder_maps[modality].extend( placeholder_map) - input_mrope_positions.append(seq_data_mrope_positions) + input_mrope_positions.append( + seq_data_mrope_positions) # type: ignore if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -1205,11 +1207,12 @@ def _prepare_prompt( input_positions = None # type: ignore else: input_mrope_positions = None # type: ignore - input_positions_tensor = make_tensor_with_pad(input_positions, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu') + input_positions_tensor = make_tensor_with_pad( + input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu') slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, @@ -1386,13 +1389,13 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() - if self.model_is_mrope: input_positions = None # type: ignore else: input_mrope_positions = None # type: ignore - input_positions = torch.tensor(input_positions or input_mrope_positions, + input_positions = torch.tensor(input_positions + or input_mrope_positions, dtype=torch.long, device='cpu') From 70ef9404f48cbf0551ebe05286f817d58c7e8279 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 23:07:22 +0000 Subject: [PATCH 27/34] remove Optinal --- vllm/worker/hpu_model_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index aa126de71c126..5b0990fe03d1e 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -233,10 +233,11 @@ def find_rope_layer(parent, path): return path_to_rope -def make_mrope_positions_tensor_with_pad( +def make_mrope_positions_tensor_with_pad( \ input_positions: List[List[int]], - input_mrope_positions: List[List[List[int]]], max_prompt_len: int, - pad: int) -> Optional[List[List[int]]]: + input_mrope_positions: List[List[List[int]]], + max_prompt_len: int, + pad: int) -> List[List[int]]: # If no mrope positions, returns a flatten (seq_len,) if all(mrope_position is None for mrope_position in input_mrope_positions): return make_tensor_with_pad(input_positions, From 15d735c002df690e60f48ac43e854a1f56ee6714 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 23:17:18 +0000 Subject: [PATCH 28/34] lint qwen2_5_vl --- vllm/model_executor/models/qwen2_5_vl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 5049f33d19cce..8ce5437cae7b8 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -612,10 +612,10 @@ def forward( # windows attention window_index, cu_window_seqlens = self.get_window_index(grid_thw) + def remove_duplicates_cpu(a): - return [ - a[i] for i in range(len(a)) if i==0 or a[i-1]!= a[i] - ] + return [a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i]] + cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) cu_window_seqlens = torch.tensor( cu_window_seqlens, From 
f6b95f8a43a2221fb7eb9718fa5cf8a4ca7978bf Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Fri, 28 Feb 2025 16:25:30 +0000 Subject: [PATCH 29/34] add reviewers suggestions --- README_GAUDI.md | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 31 +++++++++++++++--------- vllm/worker/hpu_model_runner.py | 12 +-------- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index ce3b263aa46f9..a355b0638ecc8 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -372,7 +372,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used. `1` is the default. - `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs. -- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava model. +- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava and qwen models. - `VLLM_PROMPT_USE_FLEX_ATTENTION` is enabled only for llama model, and allows to use torch.nn.attention.flex_attention instead of FusedSDPA. Note, this requires `VLLM_PROMPT_USE_FUSEDSDPA=0` # Quantization, FP8 Inference and Model Calibration Process diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8ce5437cae7b8..54b4e214e4ed0 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -57,7 +57,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig -from vllm.platforms import _Backend +from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -71,6 +71,7 @@ from .vision import get_vit_attn_backend logger = init_logger(__name__) +is_hpu = current_platform.is_hpu() # === Vision Inputs === # @@ -613,17 +614,23 @@ def forward( # windows attention window_index, cu_window_seqlens = self.get_window_index(grid_thw) - def remove_duplicates_cpu(a): - return [a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i]] - - cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) - cu_window_seqlens = torch.tensor( - cu_window_seqlens, - device=hidden_states.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - # NOTE: unique_consecutive is a dynamic operation - # we are replacing it with the `remove_duplicates_cpu` above - #cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + if is_hpu: + # NOTE: unique_consecutive is a dynamic operation + # we are using `remove_duplicates_cpu` instead + def remove_duplicates_cpu(a): + return [a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i]] + + cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + else: + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) seq_len, _ = hidden_states.size() hidden_states = hidden_states.reshape( diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 5b0990fe03d1e..1c2745c13f909 100755 --- 
a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -170,11 +170,6 @@ def get_target_layer_suffix_list(model_type) -> list[str]: ] -def get_hpu_disable_tensor_cache(): - env_var = os.environ.get('HPU_DISABLE_TENSOR_CACHE', 'true') - return env_var.lower() == 'true' - - def modify_model_layers(module: torch.nn.Module, suffix_list: list[str], n=1, @@ -920,14 +915,9 @@ def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): return seq_group_metadata_list, real_batch_size, batch_size_padded def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): - disable_tensor_cache = get_hpu_disable_tensor_cache() - if self.model_is_mrope: - logger.warning( - "Setting HPU_DISABLE_TENSOR_CACHE to False for this model") - disable_tensor_cache = False return htorch.hpu.wrap_in_hpu_graph( HpuModelAdapter(*args, **kwargs), - disable_tensor_cache=disable_tensor_cache, + disable_tensor_cache=True, ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) From 175a927d51d2c163a6059a47f766ed9a2a86901a Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Fri, 28 Feb 2025 16:39:54 +0000 Subject: [PATCH 30/34] lint --- vllm/model_executor/models/qwen2_5_vl.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 54b4e214e4ed0..e04a718da4f65 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -618,18 +618,24 @@ def forward( # NOTE: unique_consecutive is a dynamic operation # we are using `remove_duplicates_cpu` instead def remove_duplicates_cpu(a): - return [a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i]] + return [ + a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i] + ] cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) cu_window_seqlens = torch.tensor( cu_window_seqlens, device=hidden_states.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + dtype=grid_thw.dtype + if torch.jit.is_tracing() else torch.int32) + else: cu_window_seqlens = torch.tensor( cu_window_seqlens, device=hidden_states.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + dtype=grid_thw.dtype + if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) seq_len, _ = hidden_states.size() From 5baa1ed6951d53d143ace65ac5ceebfde06d41fd Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Fri, 28 Feb 2025 16:54:48 +0000 Subject: [PATCH 31/34] remove blank line --- vllm/model_executor/models/qwen2_5_vl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e04a718da4f65..c7fa789b556d5 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -635,7 +635,6 @@ def remove_duplicates_cpu(a): device=hidden_states.device, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) seq_len, _ = hidden_states.size() From 7fe109a7e292e1b9ee18087bbc22df0b3806134a Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 10 Mar 2025 15:56:15 +0000 Subject: [PATCH 32/34] input_mrope_positions if/else simplifications --- vllm/utils.py | 30 +++++++++++++++++++ vllm/worker/hpu_model_runner.py | 52 +++++---------------------------- 2 files changed, 38 insertions(+), 44 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 
216808f51e01d..da79625572cde 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -823,6 +823,36 @@ def make_tensor_with_pad( return tensor +def make_mrope_positions_tensor_with_pad( \ + input_positions: List[List[int]], + input_mrope_positions: List[List[List[int]]], + max_prompt_len: int, + pad: int) -> List[List[int]]: + # If no mrope positions, returns a flatten (seq_len,) + if all(mrope_position is None for mrope_position in input_mrope_positions): + return make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu').flatten() + # Otherwise, Qwen2.5-VL expects positions in a (3, seq_len) + # we are going to pad each seq_data in the list + # using either MRope values or regular position + mrope_input_positions: List[List[int]] = [[] for _ in range(3)] + for idx in range(3): + for b_idx, input_mrope_position in enumerate(input_mrope_positions): + if input_mrope_position is not None: + positions = input_mrope_position[idx] + else: + positions = input_positions[b_idx] + padding_size = max_prompt_len - len(positions) + assert padding_size >= 0 + padded_positions = positions \ + + (max_prompt_len - len(positions)) * [pad] + mrope_input_positions[idx].extend(padded_positions) + return torch.tensor(mrope_input_positions, dtype=torch.long, device='cpu') + + def make_tensor_with_pad_align( x: List[List[T]], pad: T, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 1c2745c13f909..32d993c724f90 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -59,7 +59,7 @@ SequenceOutput) from vllm.transformers_utils.config import uses_mrope from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available, - make_tensor_with_pad) + make_tensor_with_pad, make_mrope_positions_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -228,36 +228,6 @@ def find_rope_layer(parent, path): return path_to_rope -def make_mrope_positions_tensor_with_pad( \ - input_positions: List[List[int]], - input_mrope_positions: List[List[List[int]]], - max_prompt_len: int, - pad: int) -> List[List[int]]: - # If no mrope positions, returns a flatten (seq_len,) - if all(mrope_position is None for mrope_position in input_mrope_positions): - return make_tensor_with_pad(input_positions, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu').flatten() - # Otherwise, Qwen2.5-VL expects positions in a (3, seq_len) - # we are going to pad each seq_data in the list - # using either MRope values or regular position - mrope_input_positions: List[List[int]] = [[] for _ in range(3)] - for idx in range(3): - for b_idx, input_mrope_position in enumerate(input_mrope_positions): - if input_mrope_position is not None: - positions = input_mrope_position[idx] - else: - positions = input_positions[b_idx] - padding_size = max_prompt_len - len(positions) - assert padding_size >= 0 - padded_positions = positions \ - + (max_prompt_len - len(positions)) * [pad] - mrope_input_positions[idx].extend(padded_positions) - return torch.tensor(mrope_input_positions, dtype=torch.long, device='cpu') - - class HpuModelAdapter: def __init__(self, model, vllm_config, layer_names): @@ -1190,15 +1160,13 @@ def _prepare_prompt( device='cpu') if self.model_is_mrope: - input_positions_tensor = \ + input_positions = \ make_mrope_positions_tensor_with_pad(input_positions=input_positions, input_mrope_positions=input_mrope_positions, 
max_prompt_len=max_prompt_len, pad=0) - input_positions = None # type: ignore else: - input_mrope_positions = None # type: ignore - input_positions_tensor = make_tensor_with_pad( + input_positions = make_tensor_with_pad( input_positions, max_len=max_prompt_len, pad=0, @@ -1232,7 +1200,7 @@ def _prepare_prompt( self.device, non_blocking=True) input_tokens_tensor = input_tokens_tensor.to( # type: ignore self.device, non_blocking=True) - input_positions_tensor = input_positions_tensor.to( # type: ignore + input_positions = input_positions.to( # type: ignore self.device, non_blocking=True) slot_mapping = slot_mapping.to( # type: ignore self.device, non_blocking=True) @@ -1267,7 +1235,7 @@ def _prepare_prompt( self.device, non_blocking=True) return PreparePromptMetadata(input_tokens=input_tokens_tensor, - input_positions=input_positions_tensor, + input_positions=input_positions, attn_metadata=attn_metadata, seq_lens=seq_lens, query_lens=query_lens, @@ -1380,13 +1348,9 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() - if self.model_is_mrope: - input_positions = None # type: ignore - else: - input_mrope_positions = None # type: ignore - - input_positions = torch.tensor(input_positions - or input_mrope_positions, + input_positions = torch.tensor(input_mrope_positions + if self.model_is_mrope + else input_positions, dtype=torch.long, device='cpu') From 264676d31f71d48afd99fb546c892a148aae57d6 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 10 Mar 2025 09:37:24 -0700 Subject: [PATCH 33/34] Enable FusedSDPA for Qwen2.5 VL --- vllm/model_executor/models/qwen2_5_vl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index c7fa789b556d5..0b3f9014568b9 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -313,10 +313,14 @@ def forward( v_i = v[:, start_idx:end_idx] q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]) - output_i = F.scaled_dot_product_attention(q_i, - k_i, - v_i, - dropout_p=0.0) + if is_hpu: + from habana_frameworks.torch.hpex.kernels import FusedSDPA + output_i = FusedSDPA.apply(q_i, k_i, v_i, None, 0.0) + else: + output_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) From cb09a4b266d0a45ab44cbbb662b64b594fab86fb Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 10 Mar 2025 18:03:02 +0000 Subject: [PATCH 34/34] Lint fix --- vllm/worker/hpu_model_runner.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 32d993c724f90..adef55180cdc7 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -59,7 +59,8 @@ SequenceOutput) from vllm.transformers_utils.config import uses_mrope from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available, - make_tensor_with_pad, make_mrope_positions_tensor_with_pad) + make_mrope_positions_tensor_with_pad, + make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -1166,12 +1167,11 @@ def _prepare_prompt( max_prompt_len=max_prompt_len, pad=0) else: - input_positions = make_tensor_with_pad( - input_positions, - 
max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu') + input_positions = make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu') slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, @@ -1348,11 +1348,10 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() - input_positions = torch.tensor(input_mrope_positions - if self.model_is_mrope - else input_positions, - dtype=torch.long, - device='cpu') + input_positions = torch.tensor( + input_mrope_positions if self.model_is_mrope else input_positions, + dtype=torch.long, + device='cpu') num_decode_tokens = len(seq_lens)
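A minimal usage sketch of the make_mrope_positions_tensor_with_pad helper that the series adds to vllm/utils.py, assuming a two-sequence prompt batch in which only the first sequence carries mrope (multi-modal) positions; all position values below are illustrative:

from vllm.utils import make_mrope_positions_tensor_with_pad

# Per-sequence text positions and, for the multi-modal sequence only, the
# 3 x seq_len mrope rows produced by MRotaryEmbedding.get_input_positions.
input_positions = [[0, 1, 2, 3], [0, 1]]
input_mrope_positions = [
    [[0, 1, 2, 3], [0, 0, 1, 1], [0, 1, 0, 1]],  # multi-modal sequence
    None,                                        # text-only sequence
]

positions = make_mrope_positions_tensor_with_pad(
    input_positions=input_positions,
    input_mrope_positions=input_mrope_positions,
    max_prompt_len=4,
    pad=0)
# positions is a (3, 8) long tensor on CPU: each rope row holds the padded
# positions of both sequences back to back, whereas a batch with no mrope
# sequences would instead get the flattened (batch * max_prompt_len,) layout.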