From 938ef83ac4288b77d43b72fd7cf8ff700c38639b Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 6 Feb 2025 07:38:41 +0000 Subject: [PATCH 01/34] Initial commit Fails in rotary_embed layer in the view --- .../model_executor/layers/rotary_embedding.py | 20 +++++ vllm/worker/hpu_model_runner.py | 86 +++++++++++++++++-- 2 files changed, 100 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 6ebab8927a92b..49d6b0ac13134 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -849,6 +849,26 @@ def forward( dim=-1) query_shape = query.shape + breakpoint() + ''' + in CPU: + + query.shape + torch.Size([1451, 3584]) + + num_tokens + 1451 + (Pdb) self.head_size + 128 + + on HPU: + query.shape + torch.Size([32, 1024, 3584]) + (Pdb) num_tokens + 1024 + (Pdb) self.head_size + 128 + ''' query = query.view(num_tokens, -1, self.head_size) query_rot = query[..., :self.rotary_dim] query_pass = query[..., self.rotary_dim:] diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 276685274b957..ed60f836ed51a 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -43,6 +43,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -421,9 +422,11 @@ def _prepare_cos_sin(self, positions): current_module.prepare_cos_sin( positions, recompute_cos_sin=self.recompute_cos_sin) else: - raise AttributeError( - "The module at the end of the path does not have \ - a 'prepare_cos_sin' method.") + pass + # dont raise error for qwen2.5-vl + #raise AttributeError( + # "The module at the end of the path does not have \ + # a 'prepare_cos_sin' method.") def forward(self, *args, **kwargs): kwargs = kwargs.copy() @@ -759,6 +762,15 @@ def _set_gc_threshold(self) -> None: self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true' + @property + def model_is_mrope(self) -> bool: + """Detect if the model has "mrope" rope_scaling type. 
+ mrope requires keep "rope_deltas" between prompt and decoding phases.""" + rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) + if rope_scaling is None: + return False + return rope_scaling.get("type", None) == "mrope" + def load_model(self) -> None: import habana_frameworks.torch.core as htcore if self.model_config.quantization == 'inc' or \ @@ -935,6 +947,7 @@ def _prepare_prompt( ) -> PreparePromptMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] + input_mrope_positions: List[List[int]] = [[] for _ in range(3)] slot_mapping: List[List[int]] = [] lora_index_mapping: List[List[int]] = [] lora_prompt_mapping: List[List[int]] = [] @@ -1019,6 +1032,37 @@ def _prepare_prompt( seq_group_metadata.mm_processor_kwargs, ) + mrope_positions = None + if self.runner.model_is_mrope: + image_grid_thw = mm_kwargs.get("image_grid_thw", None) + video_grid_thw = mm_kwargs.get("video_grid_thw", None) + assert image_grid_thw is not None or video_grid_thw is not None, ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw'.") + + hf_config = self.runner.model_config.hf_config + token_ids = seq_data.get_token_ids() + mrope_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config. + spatial_merge_size, + context_len=computed_len, + ) + seq_data.mrope_position_delta = mrope_position_delta + if mrope_positions: + for idx in range(3): + input_mrope_positions[idx].extend(mrope_positions[idx]) + else: + input_positions.extend(list(range(computed_len, seq_len))) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): @@ -1058,6 +1102,11 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping[-1].append(slot) + if any(input_mrope_positions): + input_positions = None # type: ignore + else: + input_mrope_positions = None # type: ignore + max_query_len = max(query_lens) real_num_seqs = len(query_lens) @@ -1110,7 +1159,7 @@ def _prepare_prompt( dtype=torch.long, device='cpu') - input_positions = make_tensor_with_pad(input_positions, + input_positions = make_tensor_with_pad(input_positions or input_mrope_positions, max_len=max_prompt_len, pad=0, dtype=torch.long, @@ -1196,6 +1245,7 @@ def _prepare_decode( ) -> PrepareDecodeMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] + input_mrope_positions: List[List[int]] = [[] for _ in range(3)] slot_mapping: List[List[int]] = [] seq_lens: List[int] = [] encoder_seq_lens: List[int] = [] @@ -1241,7 +1291,17 @@ def _prepare_decode( seq_len = seq_data.get_len() position = seq_len - 1 - input_positions.append([position]) + if seq_data.mrope_position_delta is not None: + context_len = seq_data.get_num_computed_tokens() + next_pos = MRotaryEmbedding.get_next_input_positions( + seq_data.mrope_position_delta, + context_len, + seq_len, + ) + for idx in range(3): + input_mrope_positions[idx].extend(next_pos[idx]) + else: + input_positions.append(position) seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) @@ -1264,6 +1324,20 @@ def _prepare_decode( lora_index_mapping.append(lora_id) 
lora_prompt_mapping.append(lora_id) + #sasarkar this bit isnt there in the latest cpu code. maybe subsumed by: + ''' + input_positions = torch.tensor( + input_data.input_positions + if not any(input_data.input_mrope_positions) else + input_data.input_mrope_positions, + dtype=torch.long, + device="cpu") + ''' + if any(input_mrope_positions): + input_positions = None # type: ignore + else: + input_mrope_positions = None # type: ignore + if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window // self.block_size) @@ -1278,7 +1352,7 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() - input_positions = torch.tensor(input_positions, + input_positions = torch.tensor(input_positions or input_mrope_positions, dtype=torch.long, device='cpu') From a3f884b3b68cedd33b98260d60c90068e110694b Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 6 Feb 2025 09:03:36 +0000 Subject: [PATCH 02/34] Comments to trace execution diff between cpu/hpu --- vllm/worker/hpu_model_runner.py | 38 +++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index ed60f836ed51a..e04cf3c49143c 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1018,6 +1018,21 @@ def _prepare_prompt( # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) + ''' + seq_group_metadata.multi_modal_data is None, so we dont enter this + hence multi_modal_kwargs_list isnt populated + + on cpu its: + seq_group_metadata.multi_modal_data +{'pixel_values': tensor([[-1.1061, -1.1061, -1.1061, ..., -1.4518, -1.4518, -1.4518], + [-1.1207, -1.1207, -1.1207, ..., -1.4376, -1.4376, -1.4376], + [-1.1353, -1.1353, -1.1353, ..., -1.4376, -1.4376, -1.4376], + ..., + [ 1.1128, 0.9668, 0.8792, ..., 0.8945, 1.1221, 1.3496], + [ 0.9230, 1.2004, 1.3902, ..., 0.7950, 0.3542, 0.2973], + [ 0.9814, 0.9376, 1.0836, ..., 1.2643, 1.1789, 1.1363]]), 'image_grid_thw': tensor([[ 1, 62, 92]])} + + ''' if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ @@ -1586,11 +1601,10 @@ def prepare_input_tensors( decode_slot_mapping, decode_lora_ids, ) = self._prepare_decode(decode_reqs) - - if not self.is_pooler: - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.pin_memory) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + seq_lens, query_lens, + self.device, + self.pin_memory) if not self.scheduler_config.chunked_prefill_enabled: assert (len(prefill_reqs) and len(decode_reqs)) == 0 @@ -1774,12 +1788,18 @@ def create_dummy_seq_group_metadata(self, prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids - return SequenceGroupMetadata(request_id=str(group_id), + x = SequenceGroupMetadata(request_id=str(group_id), is_prompt=(output_len == 0), seq_data={group_id: seq_data}, sampling_params=sampling_params, block_tables=block_tables, lora_request=lora_request) + ''' + x.multi_modal_data is empty.... + we need to pass in some dummy here. + I think llama3.2VL is working, how is it working if this is empty?.. 
need to track llama3.2vl status + ''' + return x def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) @@ -1864,6 +1884,9 @@ def warmup_scenario(self, profiler.start() for _ in range(times): inputs = self.prepare_model_input(seqs) + ''' + sasarkar: at this point inputs.multi_modal_kwargs.keys() is empty.. thats not good + ''' is_single_step = \ self.vllm_config.scheduler_config.num_scheduler_steps == 1 if is_prompt or is_single_step: @@ -2328,6 +2351,9 @@ def prepare_model_input( seq_group_metadata_list=seq_group_metadata_list) model_input, sampling_metadata = self.prepare_input_tensors( seq_group_metadata_list) + ''' + sasarkar: model_input.multi_modal_kwargs empty here.. not good + ''' assert model_input.attn_metadata is not None is_prompt = model_input.attn_metadata.is_prompt From c83c882dfd6daf771c5dbcd5d40be09e6ffb77d0 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 7 Feb 2025 07:20:02 +0000 Subject: [PATCH 03/34] minor --- vllm/worker/hpu_model_runner.py | 51 ++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index e04cf3c49143c..4915fc45110c2 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -767,6 +767,7 @@ def model_is_mrope(self) -> bool: """Detect if the model has "mrope" rope_scaling type. mrope requires keep "rope_deltas" between prompt and decoding phases.""" rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) + breakpoint() if rope_scaling is None: return False return rope_scaling.get("type", None) == "mrope" @@ -965,6 +966,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() + #breakpoint() for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -1035,6 +1037,7 @@ def _prepare_prompt( ''' if seq_group_metadata.multi_modal_data: positions = input_positions[0] + #breakpoint() mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, range(positions[0], positions[0] + len(positions))) @@ -1048,14 +1051,16 @@ def _prepare_prompt( ) mrope_positions = None - if self.runner.model_is_mrope: + #breakpoint() + if self.model_config.uses_mrope: + #if self.model_is_mrope: # this returns false... rope_scaling.get("type", None) == "mrope" fails as it is "default" image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") - hf_config = self.runner.model_config.hf_config + hf_config = self.model_config.hf_config token_ids = seq_data.get_token_ids() mrope_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( @@ -1068,14 +1073,18 @@ def _prepare_prompt( vision_end_token_id=hf_config.vision_end_token_id, spatial_merge_size=hf_config.vision_config. spatial_merge_size, - context_len=computed_len, + context_len=context_len, ) seq_data.mrope_position_delta = mrope_position_delta + #breakpoint() + ''' + Hpu: mrope_positions 3x1024 .. 32 of the outer loop, so 1024*32 ... 
+ ''' if mrope_positions: for idx in range(3): input_mrope_positions[idx].extend(mrope_positions[idx]) else: - input_positions.extend(list(range(computed_len, seq_len))) + input_positions.extend(list(range(context_len, seq_len))) multi_modal_kwargs_list.append(mm_kwargs) @@ -1174,6 +1183,10 @@ def _prepare_prompt( dtype=torch.long, device='cpu') + #breakpoint() + #input_mrope_positions : list: 3x32768 + # max_prompt_len: max_prompt_len + # in CPU this is: torch.Size([3, 1451]) input_positions = make_tensor_with_pad(input_positions or input_mrope_positions, max_len=max_prompt_len, pad=0, @@ -1788,12 +1801,27 @@ def create_dummy_seq_group_metadata(self, prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids - x = SequenceGroupMetadata(request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=lora_request) + # sasarkar, unify if-else later + if self.model_config.uses_mrope: + # sasarkar: hard coded img shape. what should it be in general? + multi_modal_data_dummy = MultiModalKwargs({'pixel_values': torch.rand([5704, 1176]), 'image_grid_thw': torch.tensor([[ 1, 62, 92]])}) + x = SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request, + multi_modal_data=multi_modal_data_dummy, + mm_processor_kwargs={}, + multi_modal_placeholders={'image': [{'offset': 15, 'length': 1426}]}) # sasarkar.. remove hardcoded nums + else: + x = SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request) + #breakpoint() ''' x.multi_modal_data is empty.... we need to pass in some dummy here. @@ -1835,6 +1863,7 @@ def warmup_scenario(self, # passed in, which contains a lora from the lora warmup path. dummy_lora_requests: List[LoRARequest] = [] dummy_lora_requests_per_seq: List[LoRARequest] = [] + #breakpoint() if self.lora_config and is_lora_profile_run: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -1854,6 +1883,7 @@ def warmup_scenario(self, ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs or is_pt_profiler_run else 1 + #breakpoint() if is_prompt: seqs = [ self.create_dummy_seq_group_metadata( @@ -1887,6 +1917,7 @@ def warmup_scenario(self, ''' sasarkar: at this point inputs.multi_modal_kwargs.keys() is empty.. thats not good ''' + #breakpoint() is_single_step = \ self.vllm_config.scheduler_config.num_scheduler_steps == 1 if is_prompt or is_single_step: From c5f65f936dd1f4e5bba0f01a4e99322a2749c0d4 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 7 Feb 2025 08:41:04 +0000 Subject: [PATCH 04/34] _validate_and_reshape_mm_tensor looks buggy... 
bypassing it with alternative pt code else it was editing image_grid_thw to 0,0,0 etc --- vllm/model_executor/models/qwen2_vl.py | 30 ++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 961f53cef1379..a457fbeec6faf 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -595,6 +595,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() + breakpoint() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb @@ -1112,6 +1113,14 @@ def _validate_and_reshape_mm_tensor(self, mm_input: object, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") + # sasarkar buggy + ''' + (Pdb) (list(mm_input)) +[tensor([[ 1, 62, 92]], device='hpu:0')] +(Pdb) torch.concat(list(mm_input)) +tensor([[0, 0, 0]], device='hpu:0') + + ''' return torch.concat(list(mm_input)) else: return torch.concat(mm_input) @@ -1126,10 +1135,13 @@ def _parse_and_validate_image_input( return None if pixel_values is not None: - pixel_values = self._validate_and_reshape_mm_tensor( - pixel_values, "image pixel values") - image_grid_thw = self._validate_and_reshape_mm_tensor( - image_grid_thw, "image grid_thw") + # sasarkar: _validate_and_reshape_mm_tensor seems to be messing up the values some how + #pixel_values = self._validate_and_reshape_mm_tensor( + # pixel_values, "image pixel values") + #image_grid_thw = self._validate_and_reshape_mm_tensor( + # image_grid_thw, "image grid_thw") + pixel_values = pixel_values.view(-1, pixel_values.shape[-1]) + image_grid_thw = image_grid_thw.view(-1, image_grid_thw.shape[-1]) if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of image pixel values. " @@ -1140,6 +1152,7 @@ def _parse_and_validate_image_input( image_grid_thw=image_grid_thw) if image_embeds is not None: + assert False, "Call me if this is hit" image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") image_grid_thw = self._validate_and_reshape_mm_tensor( @@ -1357,6 +1370,15 @@ def forward( video_input=video_input) input_ids = None + ''' + During "warmup": ... 
have switched off warmup memory for now + hpu: + input_ids, positions: 32x1024 + + CPU: + input_ids is None + positions: 3x1451: + ''' hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, From fca160d28f59ddb7b4259860e54206c461479753 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 7 Feb 2025 08:57:19 +0000 Subject: [PATCH 05/34] Some comments regd buggy hpu graphs running if we use enforce_eager: llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct", enforce_eager=True) --- vllm/model_executor/models/qwen2_vl.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a457fbeec6faf..786ab2177e59d 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -580,7 +580,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, + h // self.spatial_merge_size, # buggy... 62/2 is yielding 0 .. seems its ok wo hpu graphs self.spatial_merge_size, w // self.spatial_merge_size, self.spatial_merge_size, @@ -595,7 +595,6 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() - breakpoint() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb @@ -1113,7 +1112,7 @@ def _validate_and_reshape_mm_tensor(self, mm_input: object, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - # sasarkar buggy + # sasarkar buggy ... seems an issue with hpu graph? 
''' (Pdb) (list(mm_input)) [tensor([[ 1, 62, 92]], device='hpu:0')] From 095dbbd916146186bbfacb84dfebc39b7161f3ae Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 7 Feb 2025 09:09:14 +0000 Subject: [PATCH 06/34] Return early to prevent mem profiling --- vllm/model_executor/layers/rotary_embedding.py | 2 +- vllm/worker/hpu_model_runner.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 49d6b0ac13134..9f0e770c40be1 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -849,7 +849,7 @@ def forward( dim=-1) query_shape = query.shape - breakpoint() + #breakpoint() ''' in CPU: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 4915fc45110c2..4dc4c0d3281a2 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1830,6 +1830,7 @@ def create_dummy_seq_group_metadata(self, return x def profile_run(self) -> None: + return num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers bind_kv_cache( From f557d9968e76ab5ea4eaec7013c79447f4702d27 Mon Sep 17 00:00:00 2001 From: pallavi jaini Date: Fri, 7 Feb 2025 19:16:26 +0000 Subject: [PATCH 07/34] Initial commit for the Qwen 2.5 VL --- vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/model_executor/models/registry.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 6aec99b3f9641..e1fd63ffeb4a7 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1121,4 +1121,4 @@ def get_mm_mapping(self) -> MultiModelKeys: return MultiModelKeys.from_string_field( language_model="language_model", connector="visual.", - tower_model="visual.merger.") + tower_model="visual.merger.") \ No newline at end of file diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7260d973bfb28..f04867070c479 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -135,6 +135,7 @@ "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # [Auto-converted (see adapters.py)] "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"), # Technically PrithviGeoSpatialMAE is a model that works on images, both in From 8c7a2b3d2947d6174f9597465239c7d2201b700d Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Sun, 9 Feb 2025 05:16:38 +0000 Subject: [PATCH 08/34] workaround to make HPU graphs work. disable_tensor_cache set to false. 
--- test_multimodal.py | 34 +++++++++++++++++++++++++++++++++ vllm/worker/hpu_model_runner.py | 4 +++- 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 test_multimodal.py diff --git a/test_multimodal.py b/test_multimodal.py new file mode 100644 index 0000000000000..f216e06d2ed2f --- /dev/null +++ b/test_multimodal.py @@ -0,0 +1,34 @@ +from vllm import LLM +from vllm import SamplingParams +from vllm.assets.image import ImageAsset +import PIL +import multiprocessing + +def main(): + # Load the image + image = ImageAsset("stop_sign").pil_image + + sampling_params = SamplingParams(temperature=0.8, top_p=0.95 ) + # Initialize the LLM with a multimodal model like LLaVA + # llava-hf/llava-1.5-7b-hf + # Qwen/Qwen2-VL-7B-Instruct + # meta-llama/Llama-3.2-11B-Vision-Instruct -> /root/sasarkar/clean_model_garden/models--meta-llama--Llama-3.2-11B-Vision-Instruct + llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct", enforce_eager=False) + #llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", enforce_eager=True) + #llm = LLM(model="llava-hf/llava-1.5-7b-hf") + #llm = LLM(model="/root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5/", tensor_parallel_size=2,) + # Create the prompt with image data + # llava prompt + #prompt = "USER: \nWhat is the content of this image?\nASSISTANT: <|image|>" + #prompt = "" * 576 + ("\nUSER: What is the content of this image?\nASSISTANT:") + # qwen2-vl prompt + prompt = '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n' + outputs = llm.generate({"prompt": prompt, "multi_modal_data": {"image": image}}) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +if __name__ == "__main__": + multiprocessing.freeze_support() + main() \ No newline at end of file diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 4dc4c0d3281a2..94e8e0c79047d 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -890,8 +890,10 @@ def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): return seq_group_metadata_list, real_batch_size, batch_size_padded def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): + import os + workaround = os.environ.get('WORKAROUND', '0') == '1' # there is also a flag provided for disabletensorcache return htorch.hpu.wrap_in_hpu_graph( - HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True + HpuModelAdapter(*args, **kwargs), disable_tensor_cache=not workaround # orig code its set to True ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) From 22bc3ef8900a0aff7d6906535911bd2d44e6a598 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 10 Feb 2025 19:25:29 +0000 Subject: [PATCH 09/34] adding qwen2.5-vl to hpu + small cleanups --- .../model_executor/layers/rotary_embedding.py | 20 ------------------ vllm/worker/hpu_model_runner.py | 21 ++++++------------- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 9f0e770c40be1..6ebab8927a92b 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -849,26 +849,6 @@ def forward( dim=-1) query_shape = query.shape - #breakpoint() - ''' - in CPU: - - query.shape - torch.Size([1451, 3584]) - - num_tokens - 1451 - (Pdb) 
self.head_size - 128 - - on HPU: - query.shape - torch.Size([32, 1024, 3584]) - (Pdb) num_tokens - 1024 - (Pdb) self.head_size - 128 - ''' query = query.view(num_tokens, -1, self.head_size) query_rot = query[..., :self.rotary_dim] query_pass = query[..., self.rotary_dim:] diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 94e8e0c79047d..d43c96671b406 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -58,6 +58,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, Logprob, SequenceData, SequenceGroupMetadata, SequenceOutput) +from vllm.transformers_utils.config import uses_mrope from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( @@ -764,13 +765,8 @@ def _set_gc_threshold(self) -> None: @property def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. - mrope requires keep "rope_deltas" between prompt and decoding phases.""" - rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) - breakpoint() - if rope_scaling is None: - return False - return rope_scaling.get("type", None) == "mrope" + config = self.model_config.hf_config + return uses_mrope(config) def load_model(self) -> None: import habana_frameworks.torch.core as htcore @@ -1053,11 +1049,10 @@ def _prepare_prompt( ) mrope_positions = None - #breakpoint() if self.model_config.uses_mrope: - #if self.model_is_mrope: # this returns false... rope_scaling.get("type", None) == "mrope" fails as it is "default" image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) assert image_grid_thw is not None or video_grid_thw is not None, ( "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") @@ -1067,14 +1062,10 @@ def _prepare_prompt( mrope_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. 
- spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, context_len=context_len, ) seq_data.mrope_position_delta = mrope_position_delta From d4a721c749c119f96463b03d7affb12b3776e0ec Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 10 Feb 2025 20:20:34 +0000 Subject: [PATCH 10/34] removing duplicates CPU --- vllm/model_executor/models/qwen2_5_vl.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e1fd63ffeb4a7..10d1ef69b82de 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -612,11 +612,18 @@ def forward( # windows attention window_index, cu_window_seqlens = self.get_window_index(grid_thw) + def remove_duplicates_cpu(a): + return [ + a[i] for i in range(len(a)) if i==0 or a[i-1]!= a[i] + ] + cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) cu_window_seqlens = torch.tensor( cu_window_seqlens, device=hidden_states.device, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + # This is not a static operation, removing duplicates earlier on CPU + #cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + seq_len, _ = hidden_states.size() hidden_states = hidden_states.reshape( seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) From 5474d9bb89909b6df05b3ad0ec05d8fe703fe2df Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 13 Feb 2025 19:01:15 +0000 Subject: [PATCH 11/34] small changes to work with llama-3.2-vl --- vllm/worker/hpu_model_runner.py | 103 ++++++-------------------------- 1 file changed, 18 insertions(+), 85 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d43c96671b406..cbd08675ade52 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -21,7 +21,6 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch -import torch.nn as nn import vllm_hpu_extension.environment as environment from vllm_hpu_extension.bucketing import HPUBucketingContext from vllm_hpu_extension.flags import enabled_flags @@ -893,7 +892,7 @@ def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) - def get_model(self) -> nn.Module: + def get_model(self) -> torch.nn.Module: if isinstance(self.model, HpuModelAdapter): return self.model.model return self.model @@ -964,7 +963,6 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() - #breakpoint() for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -1018,24 +1016,8 @@ def _prepare_prompt( # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) - ''' - seq_group_metadata.multi_modal_data is None, so we dont enter this - hence multi_modal_kwargs_list isnt populated - - on cpu its: - seq_group_metadata.multi_modal_data -{'pixel_values': tensor([[-1.1061, -1.1061, -1.1061, ..., -1.4518, -1.4518, -1.4518], - [-1.1207, -1.1207, -1.1207, ..., -1.4376, -1.4376, -1.4376], - [-1.1353, -1.1353, -1.1353, ..., -1.4376, -1.4376, -1.4376], - ..., - [ 1.1128, 0.9668, 0.8792, ..., 0.8945, 1.1221, 1.3496], - [ 0.9230, 1.2004, 1.3902, ..., 0.7950, 0.3542, 0.2973], - [ 0.9814, 0.9376, 1.0836, ..., 1.2643, 1.1789, 1.1363]]), 'image_grid_thw': tensor([[ 1, 62, 92]])} - - ''' if seq_group_metadata.multi_modal_data: positions = input_positions[0] - #breakpoint() mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, range(positions[0], positions[0] + len(positions))) @@ -1049,7 +1031,7 @@ def _prepare_prompt( ) mrope_positions = None - if self.model_config.uses_mrope: + if self.model_is_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) @@ -1069,15 +1051,9 @@ def _prepare_prompt( context_len=context_len, ) seq_data.mrope_position_delta = mrope_position_delta - #breakpoint() - ''' - Hpu: mrope_positions 3x1024 .. 32 of the outer loop, so 1024*32 ... - ''' - if mrope_positions: - for idx in range(3): - input_mrope_positions[idx].extend(mrope_positions[idx]) - else: - input_positions.extend(list(range(context_len, seq_len))) + if mrope_positions: + for idx in range(3): + input_mrope_positions[idx].extend(mrope_positions[idx]) multi_modal_kwargs_list.append(mm_kwargs) @@ -1176,10 +1152,6 @@ def _prepare_prompt( dtype=torch.long, device='cpu') - #breakpoint() - #input_mrope_positions : list: 3x32768 - # max_prompt_len: max_prompt_len - # in CPU this is: torch.Size([3, 1451]) input_positions = make_tensor_with_pad(input_positions or input_mrope_positions, max_len=max_prompt_len, pad=0, @@ -1322,7 +1294,7 @@ def _prepare_decode( for idx in range(3): input_mrope_positions[idx].extend(next_pos[idx]) else: - input_positions.append(position) + input_positions.append([position]) seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) @@ -1345,20 +1317,6 @@ def _prepare_decode( lora_index_mapping.append(lora_id) lora_prompt_mapping.append(lora_id) - #sasarkar this bit isnt there in the latest cpu code. 
maybe subsumed by: - ''' - input_positions = torch.tensor( - input_data.input_positions - if not any(input_data.input_mrope_positions) else - input_data.input_mrope_positions, - dtype=torch.long, - device="cpu") - ''' - if any(input_mrope_positions): - input_positions = None # type: ignore - else: - input_mrope_positions = None # type: ignore - if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window // self.block_size) @@ -1373,6 +1331,12 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() + + if any(input_mrope_positions): + input_positions = None # type: ignore + else: + input_mrope_positions = None # type: ignore + input_positions = torch.tensor(input_positions or input_mrope_positions, dtype=torch.long, device='cpu') @@ -1794,36 +1758,14 @@ def create_dummy_seq_group_metadata(self, prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids - # sasarkar, unify if-else later - if self.model_config.uses_mrope: - # sasarkar: hard coded img shape. what should it be in general? - multi_modal_data_dummy = MultiModalKwargs({'pixel_values': torch.rand([5704, 1176]), 'image_grid_thw': torch.tensor([[ 1, 62, 92]])}) - x = SequenceGroupMetadata(request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=lora_request, - multi_modal_data=multi_modal_data_dummy, - mm_processor_kwargs={}, - multi_modal_placeholders={'image': [{'offset': 15, 'length': 1426}]}) # sasarkar.. remove hardcoded nums - else: - x = SequenceGroupMetadata(request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=lora_request) - #breakpoint() - ''' - x.multi_modal_data is empty.... - we need to pass in some dummy here. - I think llama3.2VL is working, how is it working if this is empty?.. need to track llama3.2vl status - ''' - return x + return SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request) def profile_run(self) -> None: - return num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers bind_kv_cache( @@ -1857,7 +1799,6 @@ def warmup_scenario(self, # passed in, which contains a lora from the lora warmup path. dummy_lora_requests: List[LoRARequest] = [] dummy_lora_requests_per_seq: List[LoRARequest] = [] - #breakpoint() if self.lora_config and is_lora_profile_run: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -1877,7 +1818,6 @@ def warmup_scenario(self, ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs or is_pt_profiler_run else 1 - #breakpoint() if is_prompt: seqs = [ self.create_dummy_seq_group_metadata( @@ -1908,10 +1848,6 @@ def warmup_scenario(self, profiler.start() for _ in range(times): inputs = self.prepare_model_input(seqs) - ''' - sasarkar: at this point inputs.multi_modal_kwargs.keys() is empty.. 
thats not good - ''' - #breakpoint() is_single_step = \ self.vllm_config.scheduler_config.num_scheduler_steps == 1 if is_prompt or is_single_step: @@ -2376,9 +2312,6 @@ def prepare_model_input( seq_group_metadata_list=seq_group_metadata_list) model_input, sampling_metadata = self.prepare_input_tensors( seq_group_metadata_list) - ''' - sasarkar: model_input.multi_modal_kwargs empty here.. not good - ''' assert model_input.attn_metadata is not None is_prompt = model_input.attn_metadata.is_prompt From 008fbb53ad1b1b187e7d23eff7ebc58595b76716 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Sun, 16 Feb 2025 02:18:09 +0000 Subject: [PATCH 12/34] skip profile_run for now --- vllm/worker/hpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index cbd08675ade52..8c2a1b0804f3c 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1766,6 +1766,7 @@ def create_dummy_seq_group_metadata(self, lora_request=lora_request) def profile_run(self) -> None: + return num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers bind_kv_cache( From f48d6fc84f8a02d0d457e95c8c4285ed7a4d6d17 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Tue, 18 Feb 2025 20:18:38 +0000 Subject: [PATCH 13/34] reshape positions in MRotaryEmbedding for HPU --- vllm/model_executor/layers/rotary_embedding.py | 5 +++++ vllm/worker/hpu_model_runner.py | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 6ebab8927a92b..1468c8ec30182 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,6 +830,11 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 + if positions.ndim == 2 and positions.shape[0] != 3: + # HPU positions are [batch_size, num_tokens] + # if they are not [3, num_tokens], we will + # reshape it to be [num_tokens, ] + positions = positions.view(-1) num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8c2a1b0804f3c..cbd08675ade52 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1766,7 +1766,6 @@ def create_dummy_seq_group_metadata(self, lora_request=lora_request) def profile_run(self) -> None: - return num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers bind_kv_cache( From 4caf3834fde89da3843f720b16215b72108cb2e3 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Fri, 21 Feb 2025 09:31:58 +0000 Subject: [PATCH 14/34] input positions [3, seq_len] or [seq_len,] for Qwen2.5vl --- .../model_executor/layers/rotary_embedding.py | 7 +- vllm/worker/hpu_model_runner.py | 114 ++++++++++++------ 2 files changed, 78 insertions(+), 43 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 1468c8ec30182..d374d880efa09 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,12 +830,7 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 - if positions.ndim == 2 and positions.shape[0] != 3: - # HPU positions are [batch_size, num_tokens] - # if they are not [3, 
num_tokens], we will - # reshape it to be [num_tokens, ] - positions = positions.view(-1) - + # print(f"positions {positions.shape} query {query.shape} key {key.shape}") num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index cbd08675ade52..c0ba9243fc088 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -422,11 +422,9 @@ def _prepare_cos_sin(self, positions): current_module.prepare_cos_sin( positions, recompute_cos_sin=self.recompute_cos_sin) else: - pass - # dont raise error for qwen2.5-vl - #raise AttributeError( - # "The module at the end of the path does not have \ - # a 'prepare_cos_sin' method.") + raise AttributeError( + "The module at the end of the path does not have \ + a 'prepare_cos_sin' method.") def forward(self, *args, **kwargs): kwargs = kwargs.copy() @@ -440,9 +438,10 @@ def forward(self, *args, **kwargs): kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, self.dtype) - if 'lora_mask' in kwargs: - LoraMask.setLoraMask(kwargs.pop('lora_mask')) - if self.layer_names is not None: + LoraMask.setLoraMask(kwargs.pop('lora_mask')) + model_config = getattr(self.model, "config", None) + model_is_mrope = uses_mrope(model_config) + if self.layer_names is not None and not model_is_mrope: self._prepare_cos_sin(kwargs['positions']) with set_forward_context(kwargs['attn_metadata'], self.vllm_config, @@ -945,7 +944,7 @@ def _prepare_prompt( ) -> PreparePromptMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] - input_mrope_positions: List[List[int]] = [[] for _ in range(3)] + input_mrope_positions: List[List[int]] = [] slot_mapping: List[List[int]] = [] lora_index_mapping: List[List[int]] = [] lora_prompt_mapping: List[List[int]] = [] @@ -1016,6 +1015,7 @@ def _prepare_prompt( # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) + batch_input_mrope_positions = None if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ @@ -1030,7 +1030,6 @@ def _prepare_prompt( seq_group_metadata.mm_processor_kwargs, ) - mrope_positions = None if self.model_is_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) @@ -1050,11 +1049,11 @@ def _prepare_prompt( second_per_grid_ts=second_per_grid_ts, context_len=context_len, ) + assert mrope_positions is not None seq_data.mrope_position_delta = mrope_position_delta - if mrope_positions: - for idx in range(3): - input_mrope_positions[idx].extend(mrope_positions[idx]) - + batch_input_mrope_positions = [[] for _ in range(3)] + for idx in range(3): + batch_input_mrope_positions[idx].extend(mrope_positions[idx]) multi_modal_kwargs_list.append(mm_kwargs) @@ -1062,6 +1061,8 @@ def _prepare_prompt( multi_modal_placeholder_maps[modality].extend( placeholder_map) + input_mrope_positions.append(batch_input_mrope_positions) + if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized # yet. In this case, we just use a dummy slot mapping. 
@@ -1095,11 +1096,6 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping[-1].append(slot) - if any(input_mrope_positions): - input_positions = None # type: ignore - else: - input_mrope_positions = None # type: ignore - max_query_len = max(query_lens) real_num_seqs = len(query_lens) @@ -1152,11 +1148,40 @@ def _prepare_prompt( dtype=torch.long, device='cpu') - input_positions = make_tensor_with_pad(input_positions or input_mrope_positions, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu') + mrope_input_positions: Optional[List[List[int]]] = None + if any(mrope_position is not None + for mrope_position in input_mrope_positions): + assert self.model_is_mrope + mrope_input_positions = [[] for _ in range(3)] + for idx in range(3): + for b_idx, input_mrope_position in enumerate(input_mrope_positions): + if input_mrope_position is None: + positions = input_positions[b_idx] + else: + positions = input_mrope_position[idx] + # print(f"positions {len(positions)}") + padded_positions = make_tensor_with_pad([positions], + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu').flatten().tolist() + mrope_input_positions[idx].extend(padded_positions) + input_positions = None # type: ignore + input_positions_tensor = torch.tensor(mrope_input_positions, + dtype=torch.long, + device='cpu', + ) + else: + input_mrope_positions = None # type: ignore + input_positions_tensor = make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu') + if self.model_is_mrope: + # Qwen 2.5 vl works with flatten input_positions + input_positions_tensor = input_positions_tensor.flatten() + slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, @@ -1185,7 +1210,7 @@ def _prepare_prompt( self.device, non_blocking=True) input_tokens_tensor = input_tokens_tensor.to( # type: ignore self.device, non_blocking=True) - input_positions = input_positions.to( # type: ignore + input_positions_tensor = input_positions_tensor.to( # type: ignore self.device, non_blocking=True) slot_mapping = slot_mapping.to( # type: ignore self.device, non_blocking=True) @@ -1220,7 +1245,7 @@ def _prepare_prompt( self.device, non_blocking=True) return PreparePromptMetadata(input_tokens=input_tokens_tensor, - input_positions=input_positions, + input_positions=input_positions_tensor, attn_metadata=attn_metadata, seq_lens=seq_lens, query_lens=query_lens, @@ -1284,17 +1309,21 @@ def _prepare_decode( seq_len = seq_data.get_len() position = seq_len - 1 - if seq_data.mrope_position_delta is not None: - context_len = seq_data.get_num_computed_tokens() - next_pos = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - for idx in range(3): - input_mrope_positions[idx].extend(next_pos[idx]) - else: - input_positions.append([position]) + # FIXME: Why do we need to change the decode? 
+ # I didn't find a similar example on the GPU code + # only on the CPU + # + # if seq_data.mrope_position_delta is not None: + # context_len = seq_data.get_num_computed_tokens() + # next_pos = MRotaryEmbedding.get_next_input_positions( + # seq_data.mrope_position_delta, + # context_len, + # seq_len, + # ) + # for idx in range(3): + # input_mrope_positions[idx].extend(next_pos[idx]) + # else: + input_positions.append([position]) seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) @@ -1341,6 +1370,10 @@ def _prepare_decode( dtype=torch.long, device='cpu') + if self.model_is_mrope: + # Qwen 2.5 vl works with flatten input_positions + input_positions = input_positions.flatten() + num_decode_tokens = len(seq_lens) last_block_usage = [ @@ -1745,6 +1778,13 @@ def create_dummy_seq_group_metadata(self, sampling_params = SamplingParams(temperature=temperature) num_blocks = math.ceil(seq_len / self.block_size) seq_len = max(seq_len, 1) + # TODO: Add dummy data with metadata info + # encoder_dummy_data \ + # = self.input_registry.dummy_data_for_profiling( + # self.model_config, + # seq_len, + # self.mm_registry, + # is_encoder_data=True) if is_prompt: input_len = seq_len output_len = 0 From 998d0902ff3ede5cd18d580874174d03b45f759a Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 24 Feb 2025 18:55:01 +0000 Subject: [PATCH 15/34] fix the decoder --- .../model_executor/layers/rotary_embedding.py | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/worker/hpu_model_runner.py | 42 +++++++++---------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d374d880efa09..2d0d7c22b9fc7 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,7 +830,7 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 - # print(f"positions {positions.shape} query {query.shape} key {key.shape}") + print(f" rotary_emd positions {positions.shape} query {query.shape} key {key.shape}") num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 10d1ef69b82de..669b7e759fa59 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1066,7 +1066,7 @@ def forward( in seconds) for each grid along the temporal dimension in the 3D position IDs. `None` if no videos are passed. """ - + print(f"> qwen2_5_vl.py: input_ids {input_ids.shape} positions {positions.shape}") if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c0ba9243fc088..555b01ad2befe 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1011,6 +1011,7 @@ def _prepare_prompt( context_lens.append(context_len) query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) + print("tokens", prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) @@ -1159,7 +1160,6 @@ def _prepare_prompt( positions = input_positions[b_idx] else: positions = input_mrope_position[idx] - # print(f"positions {len(positions)}") padded_positions = make_tensor_with_pad([positions], max_len=max_prompt_len, pad=0, @@ -1171,6 +1171,7 @@ def _prepare_prompt( dtype=torch.long, device='cpu', ) + print(f" ABC: input positions with MROPE shape is {input_positions_tensor.shape}") else: input_mrope_positions = None # type: ignore input_positions_tensor = make_tensor_with_pad(input_positions, @@ -1181,6 +1182,7 @@ def _prepare_prompt( if self.model_is_mrope: # Qwen 2.5 vl works with flatten input_positions input_positions_tensor = input_positions_tensor.flatten() + print(f" ABC: input positions no mrope shape is {input_positions_tensor.shape}") slot_mapping = make_tensor_with_pad(slot_mapping, @@ -1309,22 +1311,22 @@ def _prepare_decode( seq_len = seq_data.get_len() position = seq_len - 1 - # FIXME: Why do we need to change the decode? - # I didn't find a similar example on the GPU code - # only on the CPU - # - # if seq_data.mrope_position_delta is not None: - # context_len = seq_data.get_num_computed_tokens() - # next_pos = MRotaryEmbedding.get_next_input_positions( - # seq_data.mrope_position_delta, - # context_len, - # seq_len, - # ) - # for idx in range(3): - # input_mrope_positions[idx].extend(next_pos[idx]) - # else: + input_positions.append([position]) + if seq_data.mrope_position_delta is not None: + context_len = seq_data.get_num_computed_tokens() + pos_for_mrope = MRotaryEmbedding.get_next_input_positions( + seq_data.mrope_position_delta, + context_len, + seq_len, + ) + else: + pos_for_mrope = [[position]] * 3 + + for idx in range(3): + input_mrope_positions[idx].extend(pos_for_mrope[idx]) + seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) seq_lens.append(seq_len) @@ -1361,19 +1363,15 @@ def _prepare_decode( input_tokens = output[:real_batch_size].clone() - if any(input_mrope_positions): - input_positions = None # type: ignore + if self.model_is_mrope: + input_positions = None else: - input_mrope_positions = None # type: ignore + input_mrope_positions = None input_positions = torch.tensor(input_positions or input_mrope_positions, dtype=torch.long, device='cpu') - if self.model_is_mrope: - # Qwen 2.5 vl works with flatten input_positions - input_positions = input_positions.flatten() - num_decode_tokens = len(seq_lens) last_block_usage = [ From cd1bbe090474569fbf4e2f17da878296637178a8 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 24 Feb 2025 18:56:52 +0000 Subject: [PATCH 16/34] comment prints --- vllm/model_executor/layers/rotary_embedding.py | 2 +- vllm/worker/hpu_model_runner.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 2d0d7c22b9fc7..ccecdcfbb1a39 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,7 +830,7 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 - print(f" rotary_emd positions {positions.shape} query {query.shape} key {key.shape}") + # print(f" rotary_emd positions {positions.shape} query {query.shape} key {key.shape}") num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/worker/hpu_model_runner.py 
b/vllm/worker/hpu_model_runner.py index 555b01ad2befe..27924da5ecec1 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1011,7 +1011,7 @@ def _prepare_prompt( context_lens.append(context_len) query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) - print("tokens", prompt_tokens) + # print("tokens", prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) @@ -1171,7 +1171,7 @@ def _prepare_prompt( dtype=torch.long, device='cpu', ) - print(f" ABC: input positions with MROPE shape is {input_positions_tensor.shape}") + # print(f" ABC: input positions with MROPE shape is {input_positions_tensor.shape}") else: input_mrope_positions = None # type: ignore input_positions_tensor = make_tensor_with_pad(input_positions, @@ -1182,7 +1182,7 @@ def _prepare_prompt( if self.model_is_mrope: # Qwen 2.5 vl works with flatten input_positions input_positions_tensor = input_positions_tensor.flatten() - print(f" ABC: input positions no mrope shape is {input_positions_tensor.shape}") + # print(f" ABC: input positions no mrope shape is {input_positions_tensor.shape}") slot_mapping = make_tensor_with_pad(slot_mapping, From 99f8e9f4cf2213f3fadf136b30e5ab768f9af2e1 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 06:46:14 +0000 Subject: [PATCH 17/34] cleanup --- test_multimodal.py | 34 ------------------- .../model_executor/layers/rotary_embedding.py | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 3 +- vllm/model_executor/models/qwen2_vl.py | 31 +++-------------- vllm/model_executor/models/registry.py | 1 - 5 files changed, 7 insertions(+), 64 deletions(-) delete mode 100644 test_multimodal.py diff --git a/test_multimodal.py b/test_multimodal.py deleted file mode 100644 index f216e06d2ed2f..0000000000000 --- a/test_multimodal.py +++ /dev/null @@ -1,34 +0,0 @@ -from vllm import LLM -from vllm import SamplingParams -from vllm.assets.image import ImageAsset -import PIL -import multiprocessing - -def main(): - # Load the image - image = ImageAsset("stop_sign").pil_image - - sampling_params = SamplingParams(temperature=0.8, top_p=0.95 ) - # Initialize the LLM with a multimodal model like LLaVA - # llava-hf/llava-1.5-7b-hf - # Qwen/Qwen2-VL-7B-Instruct - # meta-llama/Llama-3.2-11B-Vision-Instruct -> /root/sasarkar/clean_model_garden/models--meta-llama--Llama-3.2-11B-Vision-Instruct - llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct", enforce_eager=False) - #llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", enforce_eager=True) - #llm = LLM(model="llava-hf/llava-1.5-7b-hf") - #llm = LLM(model="/root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5/", tensor_parallel_size=2,) - # Create the prompt with image data - # llava prompt - #prompt = "USER: \nWhat is the content of this image?\nASSISTANT: <|image|>" - #prompt = "" * 576 + ("\nUSER: What is the content of this image?\nASSISTANT:") - # qwen2-vl prompt - prompt = '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n' - outputs = llm.generate({"prompt": prompt, "multi_modal_data": {"image": image}}) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -if __name__ == "__main__": - multiprocessing.freeze_support() - main() \ No newline at end of file 
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index ccecdcfbb1a39..6ebab8927a92b 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -830,7 +830,7 @@ def forward( key: [num_tokens, num_kv_heads * head_size] """ assert positions.ndim == 1 or positions.ndim == 2 - # print(f" rotary_emd positions {positions.shape} query {query.shape} key {key.shape}") + num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 669b7e759fa59..714f2a73a20ff 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1066,7 +1066,6 @@ def forward( in seconds) for each grid along the temporal dimension in the 3D position IDs. `None` if no videos are passed. """ - print(f"> qwen2_5_vl.py: input_ids {input_ids.shape} positions {positions.shape}") if intermediate_tensors is not None: inputs_embeds = None @@ -1128,4 +1127,4 @@ def get_mm_mapping(self) -> MultiModelKeys: return MultiModelKeys.from_string_field( language_model="language_model", connector="visual.", - tower_model="visual.merger.") \ No newline at end of file + tower_model="visual.merger.") diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 786ab2177e59d..961f53cef1379 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -580,7 +580,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, # buggy... 62/2 is yielding 0 .. seems its ok wo hpu graphs + h // self.spatial_merge_size, self.spatial_merge_size, w // self.spatial_merge_size, self.spatial_merge_size, @@ -1112,14 +1112,6 @@ def _validate_and_reshape_mm_tensor(self, mm_input: object, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - # sasarkar buggy ... seems an issue with hpu graph? - ''' - (Pdb) (list(mm_input)) -[tensor([[ 1, 62, 92]], device='hpu:0')] -(Pdb) torch.concat(list(mm_input)) -tensor([[0, 0, 0]], device='hpu:0') - - ''' return torch.concat(list(mm_input)) else: return torch.concat(mm_input) @@ -1134,13 +1126,10 @@ def _parse_and_validate_image_input( return None if pixel_values is not None: - # sasarkar: _validate_and_reshape_mm_tensor seems to be messing up the values some how - #pixel_values = self._validate_and_reshape_mm_tensor( - # pixel_values, "image pixel values") - #image_grid_thw = self._validate_and_reshape_mm_tensor( - # image_grid_thw, "image grid_thw") - pixel_values = pixel_values.view(-1, pixel_values.shape[-1]) - image_grid_thw = image_grid_thw.view(-1, image_grid_thw.shape[-1]) + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of image pixel values. 
" @@ -1151,7 +1140,6 @@ def _parse_and_validate_image_input( image_grid_thw=image_grid_thw) if image_embeds is not None: - assert False, "Call me if this is hit" image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") image_grid_thw = self._validate_and_reshape_mm_tensor( @@ -1369,15 +1357,6 @@ def forward( video_input=video_input) input_ids = None - ''' - During "warmup": ... have switched off warmup memory for now - hpu: - input_ids, positions: 32x1024 - - CPU: - input_ids is None - positions: 3x1451: - ''' hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f04867070c479..7260d973bfb28 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -135,7 +135,6 @@ "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 - "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # [Auto-converted (see adapters.py)] "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"), # Technically PrithviGeoSpatialMAE is a model that works on images, both in From 9eac068909b06df739d28df98b34e98ee85caf8c Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 17:00:30 +0000 Subject: [PATCH 18/34] polishing --- vllm/model_executor/models/qwen2_5_vl.py | 1 + vllm/worker/hpu_model_runner.py | 161 ++++++++++++----------- 2 files changed, 85 insertions(+), 77 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 714f2a73a20ff..2c7f0807c828b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1066,6 +1066,7 @@ def forward( in seconds) for each grid along the temporal dimension in the 3D position IDs. `None` if no videos are passed. 
""" + if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 27924da5ecec1..8273933c3926f 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -169,6 +169,9 @@ def get_target_layer_suffix_list(model_type) -> list[str]: decoder_layer_table.get(model_type, "DecoderLayer"), "EncoderLayer" ] +def get_hpu_disable_tensor_cache(): + env_var = os.environ.get('HPU_DISABLE_TENSOR_CACHE', 'true') + return env_var.lower() == 'true' def modify_model_layers(module: torch.nn.Module, suffix_list: list[str], @@ -227,6 +230,27 @@ def find_rope_layer(parent, path): # Return the result if found, otherwise None return path_to_rope +def build_and_pad_mrope_positions(input_positions: List[List[int]], + input_mrope_positions: List[List[List[int]]], + max_prompt_len) -> Optional[List[List[int]]]: + # Qwen2.5vl expects 3 lists of positions, we are going to pad each + # seq_data in the list using either MRope values for multi-modal + # or regular position for text only inputs + mrope_input_positions = [[] for _ in range(3)] + for idx in range(3): + for b_idx, input_mrope_position in enumerate(input_mrope_positions): + if input_mrope_position is not None: + positions = input_mrope_position[idx] + else: + # use regular positions as default + positions = input_positions[b_idx] + padded_positions = make_tensor_with_pad([positions], + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu').flatten().tolist() + mrope_input_positions[idx].extend(padded_positions) + return mrope_input_positions class HpuModelAdapter: @@ -438,7 +462,8 @@ def forward(self, *args, **kwargs): kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, self.dtype) - LoraMask.setLoraMask(kwargs.pop('lora_mask')) + if 'lora_mask' in kwargs: + LoraMask.setLoraMask(kwargs.pop('lora_mask')) model_config = getattr(self.model, "config", None) model_is_mrope = uses_mrope(model_config) if self.layer_names is not None and not model_is_mrope: @@ -884,10 +909,10 @@ def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): return seq_group_metadata_list, real_batch_size, batch_size_padded def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): - import os - workaround = os.environ.get('WORKAROUND', '0') == '1' # there is also a flag provided for disabletensorcache + disable_tensor_cache = get_hpu_disable_tensor_cache() return htorch.hpu.wrap_in_hpu_graph( - HpuModelAdapter(*args, **kwargs), disable_tensor_cache=not workaround # orig code its set to True + HpuModelAdapter(*args, **kwargs), + disable_tensor_cache=disable_tensor_cache, ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) @@ -938,13 +963,34 @@ def _check_config(self, batch_size, seq_len, attn_metadata, warmup_mode): "Configuration: (%s, %s, %s, %s) was not warmed-up!", phase, batch_size, seq_len, num_blocks) + def _get_mrope_positions_and_delta(self, seq_data, mm_kwargs, context_len): + image_grid_thw = mm_kwargs.get("image_grid_thw", None) + video_grid_thw = mm_kwargs.get("video_grid_thw", None) + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) + assert image_grid_thw is not None or video_grid_thw is not None, ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw'.") + hf_config = self.model_config.hf_config + token_ids = seq_data.get_token_ids() + mrope_positions, mrope_position_delta = \ + 
MRotaryEmbedding.get_input_positions( + token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=context_len, + ) + assert mrope_positions is not None + return mrope_positions, mrope_position_delta + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], ) -> PreparePromptMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] - input_mrope_positions: List[List[int]] = [] + input_mrope_positions: List[List[List[int]]] = [] slot_mapping: List[List[int]] = [] lora_index_mapping: List[List[int]] = [] lora_prompt_mapping: List[List[int]] = [] @@ -1011,12 +1057,11 @@ def _prepare_prompt( context_lens.append(context_len) query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) - # print("tokens", prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) - batch_input_mrope_positions = None + seq_data_mrope_positions = None if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ @@ -1031,30 +1076,19 @@ def _prepare_prompt( seq_group_metadata.mm_processor_kwargs, ) + # special processing for mrope position deltas. if self.model_is_mrope: - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - assert image_grid_thw is not None or video_grid_thw is not None, ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw'.") - - hf_config = self.model_config.hf_config - token_ids = seq_data.get_token_ids() mrope_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=context_len, - ) + self._get_mrope_positions_and_delta( + seq_data=seq_data, + mm_kwargs=mm_kwargs, + context_len=context_len) assert mrope_positions is not None seq_data.mrope_position_delta = mrope_position_delta - batch_input_mrope_positions = [[] for _ in range(3)] + seq_data_mrope_positions = [[] for _ in range(3)] for idx in range(3): - batch_input_mrope_positions[idx].extend(mrope_positions[idx]) + seq_data_mrope_positions[idx] \ + .extend(mrope_positions[idx]) multi_modal_kwargs_list.append(mm_kwargs) @@ -1062,7 +1096,7 @@ def _prepare_prompt( multi_modal_placeholder_maps[modality].extend( placeholder_map) - input_mrope_positions.append(batch_input_mrope_positions) + input_mrope_positions.append(seq_data_mrope_positions) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -1149,29 +1183,15 @@ def _prepare_prompt( dtype=torch.long, device='cpu') - mrope_input_positions: Optional[List[List[int]]] = None - if any(mrope_position is not None - for mrope_position in input_mrope_positions): - assert self.model_is_mrope - mrope_input_positions = [[] for _ in range(3)] - for idx in range(3): - for b_idx, input_mrope_position in enumerate(input_mrope_positions): - if input_mrope_position is None: - positions = input_positions[b_idx] - else: - positions = input_mrope_position[idx] - padded_positions = make_tensor_with_pad([positions], - max_len=max_prompt_len, - pad=0, - 
dtype=torch.long, - device='cpu').flatten().tolist() - mrope_input_positions[idx].extend(padded_positions) + if self.model_is_mrope: + padded_input_mrope_positions = \ + build_and_pad_mrope_positions(input_positions=input_positions, + input_mrope_positions=input_mrope_positions, + max_prompt_len=max_prompt_len) input_positions = None # type: ignore - input_positions_tensor = torch.tensor(mrope_input_positions, + input_positions_tensor = torch.tensor(padded_input_mrope_positions, dtype=torch.long, - device='cpu', - ) - # print(f" ABC: input positions with MROPE shape is {input_positions_tensor.shape}") + device='cpu') else: input_mrope_positions = None # type: ignore input_positions_tensor = make_tensor_with_pad(input_positions, @@ -1179,11 +1199,6 @@ def _prepare_prompt( pad=0, dtype=torch.long, device='cpu') - if self.model_is_mrope: - # Qwen 2.5 vl works with flatten input_positions - input_positions_tensor = input_positions_tensor.flatten() - # print(f" ABC: input positions no mrope shape is {input_positions_tensor.shape}") - slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, @@ -1311,21 +1326,19 @@ def _prepare_decode( seq_len = seq_data.get_len() position = seq_len - 1 - input_positions.append([position]) - if seq_data.mrope_position_delta is not None: - context_len = seq_data.get_num_computed_tokens() - pos_for_mrope = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - else: - pos_for_mrope = [[position]] * 3 - - for idx in range(3): - input_mrope_positions[idx].extend(pos_for_mrope[idx]) + if self.model_is_mrope: + if seq_data.mrope_position_delta is not None: + pos_for_mrope = MRotaryEmbedding \ + .get_next_input_positions( + seq_data.mrope_position_delta, + seq_data.get_num_computed_tokens(), + seq_len) + else: + pos_for_mrope = [[position]] * 3 + for idx in range(3): + input_mrope_positions[idx].extend(pos_for_mrope[idx]) seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) @@ -1602,10 +1615,11 @@ def prepare_input_tensors( decode_slot_mapping, decode_lora_ids, ) = self._prepare_decode(decode_reqs) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - seq_lens, query_lens, - self.device, - self.pin_memory) + + if not self.is_pooler: + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, seq_lens, query_lens, self.device, + self.pin_memory) if not self.scheduler_config.chunked_prefill_enabled: assert (len(prefill_reqs) and len(decode_reqs)) == 0 @@ -1776,13 +1790,6 @@ def create_dummy_seq_group_metadata(self, sampling_params = SamplingParams(temperature=temperature) num_blocks = math.ceil(seq_len / self.block_size) seq_len = max(seq_len, 1) - # TODO: Add dummy data with metadata info - # encoder_dummy_data \ - # = self.input_registry.dummy_data_for_profiling( - # self.model_config, - # seq_len, - # self.mm_registry, - # is_encoder_data=True) if is_prompt: input_len = seq_len output_len = 0 From dcc2c6c12c92d2a638c91bf46563a99ef3b43b35 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 18:59:49 +0000 Subject: [PATCH 19/34] add type ignore --- vllm/worker/hpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8273933c3926f..8485a51dfc3e9 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -236,7 +236,7 @@ def build_and_pad_mrope_positions(input_positions: 
List[List[int]], # Qwen2.5vl expects 3 lists of positions, we are going to pad each # seq_data in the list using either MRope values for multi-modal # or regular position for text only inputs - mrope_input_positions = [[] for _ in range(3)] + mrope_input_positions: List[List[int]] = [[] for _ in range(3)] for idx in range(3): for b_idx, input_mrope_position in enumerate(input_mrope_positions): if input_mrope_position is not None: @@ -1377,9 +1377,9 @@ def _prepare_decode( if self.model_is_mrope: - input_positions = None + input_positions = None # type: ignore else: - input_mrope_positions = None + input_mrope_positions = None # type: ignore input_positions = torch.tensor(input_positions or input_mrope_positions, dtype=torch.long, From 7c5871b1a4f2dc2ebe9b42edd48df5fca05505ec Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 22:59:32 +0000 Subject: [PATCH 20/34] set HPU_DISABLE_TENSOR_CACHE to false for Qwen2.5vl --- vllm/worker/hpu_model_runner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8485a51dfc3e9..c4a557cfb0a81 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -910,6 +910,11 @@ def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): disable_tensor_cache = get_hpu_disable_tensor_cache() + if self.model_is_mrope: + logger.warning( + "Setting HPU_DISABLE_TENSOR_CACHE to False for this model" + ) + disable_tensor_cache = False return htorch.hpu.wrap_in_hpu_graph( HpuModelAdapter(*args, **kwargs), disable_tensor_cache=disable_tensor_cache, From fc9e7eeaaa43fc82d9beaf8a08495f36c0724f93 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Wed, 26 Feb 2025 23:50:00 +0000 Subject: [PATCH 21/34] make lint happy? --- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c4a557cfb0a81..d7bebb007dd18 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1066,7 +1066,7 @@ def _prepare_prompt( # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) - seq_data_mrope_positions = None + seq_data_mrope_positions : Optional[List[List[int]]] = None if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ From 67b696eb44af9102e4451f10781d7e964f448e31 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Wed, 26 Feb 2025 11:48:32 -0800 Subject: [PATCH 22/34] Change torch dtype to bflat16 for qwen2.5-VL test --- tests/models/decoder_only/vision_language/test_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 2c66edb539dce..48159a33da32f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -156,6 +156,7 @@ vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], + dtype=("bfloat16" if current_platform.is_hpu() else "half") ), #### Extended model tests "aria": VLMTestInfo( From cf97bed89d0944ccc2b736a9a3a1803224168ff3 Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Thu, 27 Feb 2025 00:41:13 +0000 Subject: [PATCH 23/34] fea(): Added the tests requirements Co-authored-by: Mohit Deopujari mohit.deopujari@intel.com Co-authored-by: Jimin Ha jimin.ha@intel.com Co-authored-by: Pallavi Jaini pallavi.jaini@intel.com Co-authored-by: Deepak Narayana deepak.narayana@intel.com Co-authored-by: Sayantan Sarkar sayantan.sarkar@intel.com Co-authored-by: Gustavo Malkomes gustavo.malkomes@intel.com --- requirements-hpu-qwen2_5_vl.txt | 1 + tests/conftest.py | 5 ++++- tests/models/decoder_only/vision_language/vlm_utils/core.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 requirements-hpu-qwen2_5_vl.txt diff --git a/requirements-hpu-qwen2_5_vl.txt b/requirements-hpu-qwen2_5_vl.txt new file mode 100644 index 0000000000000..21bcfbfe0b11c --- /dev/null +++ b/requirements-hpu-qwen2_5_vl.txt @@ -0,0 +1 @@ +transformers @ git+https://github.com/huggingface/transformers.git@6b550462139655d488d4c663086a63e98713c6b9 diff --git a/tests/conftest.py b/tests/conftest.py index 7fa6a35317ff7..060034d878ae0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -258,7 +258,10 @@ def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: return x if device is None: - device = "cpu" if current_platform.is_cpu() else "cuda" + if current_platform.is_hpu(): + device = "hpu" + else: + device = "cpu" if current_platform.is_cpu() else "cuda" if isinstance(x, dict): return {k: self.wrap_device(v, device) for k, v in x.items()} diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index f2260f56737d9..d688728ce8664 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -51,7 +51,7 @@ def run_test( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_info.check_transformers_version(on_fail="skip") + #model_info.check_transformers_version(on_fail="skip") vllm_outputs_per_mm = [] hf_outputs_per_mm = [] From c986f8da1cac5ff617f01dcf5b2f6876f77ac735 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 03:57:57 +0000 Subject: [PATCH 24/34] add check_transformers to 
qwen2_5_VL --- tests/conftest.py | 4 +++- tests/models/decoder_only/vision_language/vlm_utils/core.py | 2 +- tests/models/registry.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 060034d878ae0..6700b7ca6d08d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -260,8 +260,10 @@ def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: if device is None: if current_platform.is_hpu(): device = "hpu" + elif current_platform.is_cpu(): + device = "cpu" else: - device = "cpu" if current_platform.is_cpu() else "cuda" + device = "cuda" if isinstance(x, dict): return {k: self.wrap_device(v, device) for k, v in x.items()} diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index d688728ce8664..f2260f56737d9 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -51,7 +51,7 @@ def run_test( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - #model_info.check_transformers_version(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") vllm_outputs_per_mm = [] hf_outputs_per_mm = [] diff --git a/tests/models/registry.py b/tests/models/registry.py index c3e1c7859799c..cda981f86b7ad 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -278,7 +278,7 @@ def check_available_online( "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 - min_transformers_version="4.49"), # noqa: E501 + min_transformers_version="4.48.9"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", trust_remote_code=True), # [Encoder-decoder] From 08b35bffc7f8cb8e32c6ddff4569550234dba6a6 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 06:04:06 +0000 Subject: [PATCH 25/34] improving code and comments --- vllm/model_executor/models/qwen2_5_vl.py | 3 +- vllm/worker/hpu_model_runner.py | 44 ++++++++++++++---------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 2c7f0807c828b..5049f33d19cce 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -621,7 +621,8 @@ def remove_duplicates_cpu(a): cu_window_seqlens, device=hidden_states.device, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - # This is not a static operation, removing duplicates earlier on CPU + # NOTE: unique_consecutive is a dynamic operation + # we are replacing it with the `remove_duplicates_cpu` above #cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) seq_len, _ = hidden_states.size() diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d7bebb007dd18..24bb3b7f11658 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -230,27 +230,35 @@ def find_rope_layer(parent, path): # Return the result if found, otherwise None return path_to_rope -def build_and_pad_mrope_positions(input_positions: List[List[int]], +def make_mrope_positions_tensor_with_pad(input_positions: List[List[int]], input_mrope_positions: 
List[List[List[int]]], - max_prompt_len) -> Optional[List[List[int]]]: - # Qwen2.5vl expects 3 lists of positions, we are going to pad each - # seq_data in the list using either MRope values for multi-modal - # or regular position for text only inputs + max_prompt_len: int, + pad: int) -> Optional[List[List[int]]]: + # If no mrope positions, returns a flatten (seq_len,) + if all(mrope_position is None for mrope_position in input_mrope_positions): + return make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu').flatten() + # Otherwise, Qwen2.5-VL expects positions in a (3, seq_len) + # we are going to pad each seq_data in the list + # using either MRope values or regular position mrope_input_positions: List[List[int]] = [[] for _ in range(3)] for idx in range(3): for b_idx, input_mrope_position in enumerate(input_mrope_positions): if input_mrope_position is not None: positions = input_mrope_position[idx] else: - # use regular positions as default - positions = input_positions[b_idx] - padded_positions = make_tensor_with_pad([positions], - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu').flatten().tolist() + positions = input_positions[b_idx] + padding_size = max_prompt_len - len(positions) + assert padding_size >= 0 + padded_positions = positions \ + + (max_prompt_len - len(positions)) * [pad] mrope_input_positions[idx].extend(padded_positions) - return mrope_input_positions + return torch.tensor(mrope_input_positions, + dtype=torch.long, + device='cpu') class HpuModelAdapter: @@ -1189,14 +1197,12 @@ def _prepare_prompt( device='cpu') if self.model_is_mrope: - padded_input_mrope_positions = \ - build_and_pad_mrope_positions(input_positions=input_positions, + input_positions_tensor = \ + make_mrope_positions_tensor_with_pad(input_positions=input_positions, input_mrope_positions=input_mrope_positions, - max_prompt_len=max_prompt_len) + max_prompt_len=max_prompt_len, + pad=0) input_positions = None # type: ignore - input_positions_tensor = torch.tensor(padded_input_mrope_positions, - dtype=torch.long, - device='cpu') else: input_mrope_positions = None # type: ignore input_positions_tensor = make_tensor_with_pad(input_positions, From 75eb21bba2536494a708a6b5fd121cb708966d7c Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 22:50:09 +0000 Subject: [PATCH 26/34] lint --- vllm/worker/hpu_model_runner.py | 43 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 24bb3b7f11658..aa126de71c126 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -42,8 +42,8 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -169,10 +169,12 @@ def get_target_layer_suffix_list(model_type) -> list[str]: decoder_layer_table.get(model_type, "DecoderLayer"), "EncoderLayer" ] + def get_hpu_disable_tensor_cache(): env_var = os.environ.get('HPU_DISABLE_TENSOR_CACHE', 'true') return env_var.lower() == 'true' + def 
modify_model_layers(module: torch.nn.Module, suffix_list: list[str], n=1, @@ -230,10 +232,11 @@ def find_rope_layer(parent, path): # Return the result if found, otherwise None return path_to_rope -def make_mrope_positions_tensor_with_pad(input_positions: List[List[int]], - input_mrope_positions: List[List[List[int]]], - max_prompt_len: int, - pad: int) -> Optional[List[List[int]]]: + +def make_mrope_positions_tensor_with_pad( + input_positions: List[List[int]], + input_mrope_positions: List[List[List[int]]], max_prompt_len: int, + pad: int) -> Optional[List[List[int]]]: # If no mrope positions, returns a flatten (seq_len,) if all(mrope_position is None for mrope_position in input_mrope_positions): return make_tensor_with_pad(input_positions, @@ -256,9 +259,8 @@ def make_mrope_positions_tensor_with_pad(input_positions: List[List[int]], padded_positions = positions \ + (max_prompt_len - len(positions)) * [pad] mrope_input_positions[idx].extend(padded_positions) - return torch.tensor(mrope_input_positions, - dtype=torch.long, - device='cpu') + return torch.tensor(mrope_input_positions, dtype=torch.long, device='cpu') + class HpuModelAdapter: @@ -455,7 +457,7 @@ def _prepare_cos_sin(self, positions): positions, recompute_cos_sin=self.recompute_cos_sin) else: raise AttributeError( - "The module at the end of the path does not have \ + "The module at the end of the path does not have \ a 'prepare_cos_sin' method.") def forward(self, *args, **kwargs): @@ -920,8 +922,7 @@ def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): disable_tensor_cache = get_hpu_disable_tensor_cache() if self.model_is_mrope: logger.warning( - "Setting HPU_DISABLE_TENSOR_CACHE to False for this model" - ) + "Setting HPU_DISABLE_TENSOR_CACHE to False for this model") disable_tensor_cache = False return htorch.hpu.wrap_in_hpu_graph( HpuModelAdapter(*args, **kwargs), @@ -1074,7 +1075,7 @@ def _prepare_prompt( # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) - seq_data_mrope_positions : Optional[List[List[int]]] = None + seq_data_mrope_positions: Optional[List[List[int]]] = None if seq_group_metadata.multi_modal_data: positions = input_positions[0] mm_data, placeholder_maps = MultiModalPlaceholderMap \ @@ -1109,7 +1110,8 @@ def _prepare_prompt( multi_modal_placeholder_maps[modality].extend( placeholder_map) - input_mrope_positions.append(seq_data_mrope_positions) + input_mrope_positions.append( + seq_data_mrope_positions) # type: ignore if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -1205,11 +1207,12 @@ def _prepare_prompt( input_positions = None # type: ignore else: input_mrope_positions = None # type: ignore - input_positions_tensor = make_tensor_with_pad(input_positions, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu') + input_positions_tensor = make_tensor_with_pad( + input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu') slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, @@ -1386,13 +1389,13 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() - if self.model_is_mrope: input_positions = None # type: ignore else: input_mrope_positions = None # type: ignore - input_positions = torch.tensor(input_positions or input_mrope_positions, + input_positions = torch.tensor(input_positions + or input_mrope_positions, dtype=torch.long, device='cpu') From 70ef9404f48cbf0551ebe05286f817d58c7e8279 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 23:07:22 +0000 Subject: [PATCH 27/34] remove Optinal --- vllm/worker/hpu_model_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index aa126de71c126..5b0990fe03d1e 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -233,10 +233,11 @@ def find_rope_layer(parent, path): return path_to_rope -def make_mrope_positions_tensor_with_pad( +def make_mrope_positions_tensor_with_pad( \ input_positions: List[List[int]], - input_mrope_positions: List[List[List[int]]], max_prompt_len: int, - pad: int) -> Optional[List[List[int]]]: + input_mrope_positions: List[List[List[int]]], + max_prompt_len: int, + pad: int) -> List[List[int]]: # If no mrope positions, returns a flatten (seq_len,) if all(mrope_position is None for mrope_position in input_mrope_positions): return make_tensor_with_pad(input_positions, From 15d735c002df690e60f48ac43e854a1f56ee6714 Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Thu, 27 Feb 2025 23:17:18 +0000 Subject: [PATCH 28/34] lint qwen2_5_vl --- vllm/model_executor/models/qwen2_5_vl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 5049f33d19cce..8ce5437cae7b8 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -612,10 +612,10 @@ def forward( # windows attention window_index, cu_window_seqlens = self.get_window_index(grid_thw) + def remove_duplicates_cpu(a): - return [ - a[i] for i in range(len(a)) if i==0 or a[i-1]!= a[i] - ] + return [a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i]] + cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) cu_window_seqlens = torch.tensor( cu_window_seqlens, From 
f6b95f8a43a2221fb7eb9718fa5cf8a4ca7978bf Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Fri, 28 Feb 2025 16:25:30 +0000 Subject: [PATCH 29/34] add reviewers suggestions --- README_GAUDI.md | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 31 +++++++++++++++--------- vllm/worker/hpu_model_runner.py | 12 +-------- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index ce3b263aa46f9..a355b0638ecc8 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -372,7 +372,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used. `1` is the default. - `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs. -- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava model. +- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava and qwen models. - `VLLM_PROMPT_USE_FLEX_ATTENTION` is enabled only for llama model, and allows to use torch.nn.attention.flex_attention instead of FusedSDPA. Note, this requires `VLLM_PROMPT_USE_FUSEDSDPA=0` # Quantization, FP8 Inference and Model Calibration Process diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8ce5437cae7b8..54b4e214e4ed0 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -57,7 +57,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig -from vllm.platforms import _Backend +from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -71,6 +71,7 @@ from .vision import get_vit_attn_backend logger = init_logger(__name__) +is_hpu = current_platform.is_hpu() # === Vision Inputs === # @@ -613,17 +614,23 @@ def forward( # windows attention window_index, cu_window_seqlens = self.get_window_index(grid_thw) - def remove_duplicates_cpu(a): - return [a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i]] - - cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) - cu_window_seqlens = torch.tensor( - cu_window_seqlens, - device=hidden_states.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - # NOTE: unique_consecutive is a dynamic operation - # we are replacing it with the `remove_duplicates_cpu` above - #cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + if is_hpu: + # NOTE: unique_consecutive is a dynamic operation + # we are using `remove_duplicates_cpu` instead + def remove_duplicates_cpu(a): + return [a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i]] + + cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + else: + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) seq_len, _ = hidden_states.size() hidden_states = hidden_states.reshape( diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 5b0990fe03d1e..1c2745c13f909 100755 --- 
a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -170,11 +170,6 @@ def get_target_layer_suffix_list(model_type) -> list[str]: ] -def get_hpu_disable_tensor_cache(): - env_var = os.environ.get('HPU_DISABLE_TENSOR_CACHE', 'true') - return env_var.lower() == 'true' - - def modify_model_layers(module: torch.nn.Module, suffix_list: list[str], n=1, @@ -920,14 +915,9 @@ def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): return seq_group_metadata_list, real_batch_size, batch_size_padded def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): - disable_tensor_cache = get_hpu_disable_tensor_cache() - if self.model_is_mrope: - logger.warning( - "Setting HPU_DISABLE_TENSOR_CACHE to False for this model") - disable_tensor_cache = False return htorch.hpu.wrap_in_hpu_graph( HpuModelAdapter(*args, **kwargs), - disable_tensor_cache=disable_tensor_cache, + disable_tensor_cache=True, ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) From 175a927d51d2c163a6059a47f766ed9a2a86901a Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Fri, 28 Feb 2025 16:39:54 +0000 Subject: [PATCH 30/34] lint --- vllm/model_executor/models/qwen2_5_vl.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 54b4e214e4ed0..e04a718da4f65 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -618,18 +618,24 @@ def forward( # NOTE: unique_consecutive is a dynamic operation # we are using `remove_duplicates_cpu` instead def remove_duplicates_cpu(a): - return [a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i]] + return [ + a[i] for i in range(len(a)) if i == 0 or a[i - 1] != a[i] + ] cu_window_seqlens = remove_duplicates_cpu(cu_window_seqlens) cu_window_seqlens = torch.tensor( cu_window_seqlens, device=hidden_states.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + dtype=grid_thw.dtype + if torch.jit.is_tracing() else torch.int32) + else: cu_window_seqlens = torch.tensor( cu_window_seqlens, device=hidden_states.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + dtype=grid_thw.dtype + if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) seq_len, _ = hidden_states.size() From 5baa1ed6951d53d143ace65ac5ceebfde06d41fd Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Fri, 28 Feb 2025 16:54:48 +0000 Subject: [PATCH 31/34] remove blank line --- vllm/model_executor/models/qwen2_5_vl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e04a718da4f65..c7fa789b556d5 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -635,7 +635,6 @@ def remove_duplicates_cpu(a): device=hidden_states.device, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) seq_len, _ = hidden_states.size() From 7fe109a7e292e1b9ee18087bbc22df0b3806134a Mon Sep 17 00:00:00 2001 From: Gustavo Malkomes Date: Mon, 10 Mar 2025 15:56:15 +0000 Subject: [PATCH 32/34] input_mrope_positions if/else simplifications --- vllm/utils.py | 30 +++++++++++++++++++ vllm/worker/hpu_model_runner.py | 52 +++++---------------------------- 2 files changed, 38 insertions(+), 44 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 
216808f51e01d..da79625572cde 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -823,6 +823,36 @@ def make_tensor_with_pad( return tensor +def make_mrope_positions_tensor_with_pad( \ + input_positions: List[List[int]], + input_mrope_positions: List[List[List[int]]], + max_prompt_len: int, + pad: int) -> List[List[int]]: + # If no mrope positions, returns a flatten (seq_len,) + if all(mrope_position is None for mrope_position in input_mrope_positions): + return make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu').flatten() + # Otherwise, Qwen2.5-VL expects positions in a (3, seq_len) + # we are going to pad each seq_data in the list + # using either MRope values or regular position + mrope_input_positions: List[List[int]] = [[] for _ in range(3)] + for idx in range(3): + for b_idx, input_mrope_position in enumerate(input_mrope_positions): + if input_mrope_position is not None: + positions = input_mrope_position[idx] + else: + positions = input_positions[b_idx] + padding_size = max_prompt_len - len(positions) + assert padding_size >= 0 + padded_positions = positions \ + + (max_prompt_len - len(positions)) * [pad] + mrope_input_positions[idx].extend(padded_positions) + return torch.tensor(mrope_input_positions, dtype=torch.long, device='cpu') + + def make_tensor_with_pad_align( x: List[List[T]], pad: T, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 1c2745c13f909..32d993c724f90 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -59,7 +59,7 @@ SequenceOutput) from vllm.transformers_utils.config import uses_mrope from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available, - make_tensor_with_pad) + make_tensor_with_pad, make_mrope_positions_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -228,36 +228,6 @@ def find_rope_layer(parent, path): return path_to_rope -def make_mrope_positions_tensor_with_pad( \ - input_positions: List[List[int]], - input_mrope_positions: List[List[List[int]]], - max_prompt_len: int, - pad: int) -> List[List[int]]: - # If no mrope positions, returns a flatten (seq_len,) - if all(mrope_position is None for mrope_position in input_mrope_positions): - return make_tensor_with_pad(input_positions, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu').flatten() - # Otherwise, Qwen2.5-VL expects positions in a (3, seq_len) - # we are going to pad each seq_data in the list - # using either MRope values or regular position - mrope_input_positions: List[List[int]] = [[] for _ in range(3)] - for idx in range(3): - for b_idx, input_mrope_position in enumerate(input_mrope_positions): - if input_mrope_position is not None: - positions = input_mrope_position[idx] - else: - positions = input_positions[b_idx] - padding_size = max_prompt_len - len(positions) - assert padding_size >= 0 - padded_positions = positions \ - + (max_prompt_len - len(positions)) * [pad] - mrope_input_positions[idx].extend(padded_positions) - return torch.tensor(mrope_input_positions, dtype=torch.long, device='cpu') - - class HpuModelAdapter: def __init__(self, model, vllm_config, layer_names): @@ -1190,15 +1160,13 @@ def _prepare_prompt( device='cpu') if self.model_is_mrope: - input_positions_tensor = \ + input_positions = \ make_mrope_positions_tensor_with_pad(input_positions=input_positions, input_mrope_positions=input_mrope_positions, 
max_prompt_len=max_prompt_len, pad=0) - input_positions = None # type: ignore else: - input_mrope_positions = None # type: ignore - input_positions_tensor = make_tensor_with_pad( + input_positions = make_tensor_with_pad( input_positions, max_len=max_prompt_len, pad=0, @@ -1232,7 +1200,7 @@ def _prepare_prompt( self.device, non_blocking=True) input_tokens_tensor = input_tokens_tensor.to( # type: ignore self.device, non_blocking=True) - input_positions_tensor = input_positions_tensor.to( # type: ignore + input_positions = input_positions.to( # type: ignore self.device, non_blocking=True) slot_mapping = slot_mapping.to( # type: ignore self.device, non_blocking=True) @@ -1267,7 +1235,7 @@ def _prepare_prompt( self.device, non_blocking=True) return PreparePromptMetadata(input_tokens=input_tokens_tensor, - input_positions=input_positions_tensor, + input_positions=input_positions, attn_metadata=attn_metadata, seq_lens=seq_lens, query_lens=query_lens, @@ -1380,13 +1348,9 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() - if self.model_is_mrope: - input_positions = None # type: ignore - else: - input_mrope_positions = None # type: ignore - - input_positions = torch.tensor(input_positions - or input_mrope_positions, + input_positions = torch.tensor(input_mrope_positions + if self.model_is_mrope + else input_positions, dtype=torch.long, device='cpu') From 264676d31f71d48afd99fb546c892a148aae57d6 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 10 Mar 2025 09:37:24 -0700 Subject: [PATCH 33/34] Enable FusedSDPA for Qwen2.5 VL --- vllm/model_executor/models/qwen2_5_vl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index c7fa789b556d5..0b3f9014568b9 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -313,10 +313,14 @@ def forward( v_i = v[:, start_idx:end_idx] q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]) - output_i = F.scaled_dot_product_attention(q_i, - k_i, - v_i, - dropout_p=0.0) + if is_hpu: + from habana_frameworks.torch.hpex.kernels import FusedSDPA + output_i = FusedSDPA.apply(q_i, k_i, v_i, None, 0.0) + else: + output_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) From cb09a4b266d0a45ab44cbbb662b64b594fab86fb Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 10 Mar 2025 18:03:02 +0000 Subject: [PATCH 34/34] Lint fix --- vllm/worker/hpu_model_runner.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 32d993c724f90..adef55180cdc7 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -59,7 +59,8 @@ SequenceOutput) from vllm.transformers_utils.config import uses_mrope from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available, - make_tensor_with_pad, make_mrope_positions_tensor_with_pad) + make_mrope_positions_tensor_with_pad, + make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -1166,12 +1167,11 @@ def _prepare_prompt( max_prompt_len=max_prompt_len, pad=0) else: - input_positions = make_tensor_with_pad( - input_positions, - 
max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device='cpu') + input_positions = make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu') slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, @@ -1348,11 +1348,10 @@ def _prepare_decode( real_batch_size = len(seq_group_metadata_list) input_tokens = output[:real_batch_size].clone() - input_positions = torch.tensor(input_mrope_positions - if self.model_is_mrope - else input_positions, - dtype=torch.long, - device='cpu') + input_positions = torch.tensor( + input_mrope_positions if self.model_is_mrope else input_positions, + dtype=torch.long, + device='cpu') num_decode_tokens = len(seq_lens)
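A minimal usage sketch of the make_mrope_positions_tensor_with_pad helper that the series adds to vllm/utils.py, assuming a two-sequence prompt batch in which only the first sequence carries mrope (multi-modal) positions; all position values below are illustrative:

from vllm.utils import make_mrope_positions_tensor_with_pad

# Per-sequence text positions and, for the multi-modal sequence only, the
# 3 x seq_len mrope rows produced by MRotaryEmbedding.get_input_positions.
input_positions = [[0, 1, 2, 3], [0, 1]]
input_mrope_positions = [
    [[0, 1, 2, 3], [0, 0, 1, 1], [0, 1, 0, 1]],  # multi-modal sequence
    None,                                        # text-only sequence
]

positions = make_mrope_positions_tensor_with_pad(
    input_positions=input_positions,
    input_mrope_positions=input_mrope_positions,
    max_prompt_len=4,
    pad=0)
# positions is a (3, 8) long tensor on CPU: each rope row holds the padded
# positions of both sequences back to back, whereas a batch with no mrope
# sequences would instead get the flattened (batch * max_prompt_len,) layout.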