
Commit 068d3c6

Rebased to update the transformers version and addressed review comments by removing the older QNN-based changes.
Signed-off-by: quic-dhirajku <quic_dhirajku@quicinc.com>
1 parent 796bc33 commit 068d3c6

1 file changed (+115 -29 lines)

QEfficient/transformers/models/modeling_auto.py

@@ -584,6 +584,7 @@ def export(
         )

         self.lang_model.export(inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir)
+        return self.onnx_path

     def compile(
         self,
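With this change the multimodal (dual QPC) export path returns the ONNX artifact location instead of None. A minimal usage sketch, not taken from this commit; the model card name and argument values are illustrative:

    from QEfficient import QEFFAutoModelForImageTextToText

    model = QEFFAutoModelForImageTextToText.from_pretrained(
        "meta-llama/Llama-3.2-11B-Vision-Instruct", kv_offload=True
    )
    onnx_path = model.export()  # previously returned None; now returns the exported ONNX path
    model.compile(onnx_path=onnx_path, num_cores=16)  # the returned path can be fed back into compile()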
@@ -916,7 +917,7 @@ def export(
         inputs = self.model.get_dummy_inputs()
         dynamic_axes = self.model.get_onnx_dynamic_axes()
         output_names = self.model.get_output_names()
-        self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
+        return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)

     def compile(
         self,
@@ -1170,9 +1171,69 @@ def get_model_config(self) -> dict:
 
 class QEFFAutoModelForImageTextToText:
     """
-    A factory class for creating QEFFAutoModelForImageTextToText instances with for single and Dual QPC approach
+    The QEFFAutoModelForImageTextToText class is used to work with multimodal language models from the HuggingFace hub.
+    While you can initialize the class directly, it's best to use the ``from_pretrained`` method for this purpose. This class supports both single and dual QPC approaches.
     Attributes:
         _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models.
+
+    ``Mandatory`` Args:
+        :pretrained_model_name_or_path (str): Model card name from HuggingFace or local path to model directory.
+
+    ``Optional`` Args:
+        :kv_offload (bool): Flag to toggle between single and dual QPC approaches. If set to False, the Single QPC approach will be used; otherwise, the dual QPC approach will be applied. Defaults to True.
+
+    .. code-block:: python
+        import requests
+        from PIL import Image
+        from transformers import AutoProcessor, TextStreamer
+
+        from QEfficient import QEFFAutoModelForImageTextToText
+
+        # Add HuggingFace Token to access the model
+        HF_TOKEN = ""
+        model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+        query = "Describe this image."
+        image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+
+        ## STEP - 1 Load the Processor and Model; set kv_offload=True/False for the dual or single QPC approach
+        processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
+        model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=HF_TOKEN, attn_implementation="eager", kv_offload=False)
+
+        ## STEP - 2 Export & Compile the Model
+        model.compile(
+            prefill_seq_len=32,
+            ctx_len=512,
+            img_size=560,
+            num_cores=16,
+            num_devices=1,
+            mxfp6_matmul=False,
+        )
+
+        ## STEP - 3 Load and process the inputs for Inference
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": query},
+                ],
+            }
+        ]
+        input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)]
+        inputs = processor(
+            text=input_text,
+            images=image,
+            return_tensors="pt",
+            add_special_tokens=False,
+            padding="max_length",
+            max_length=32,  # keep in sync with the prefill_seq_len passed to compile()
+        )
+
+        ## STEP - 4 Run Inference on the compiled model
+        streamer = TextStreamer(processor.tokenizer)
+        model.generate(inputs=inputs, streamer=streamer, generation_len=128)
+
     """
 
     _hf_auto_class = AutoModelForImageTextToText
@@ -1219,7 +1280,6 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel):
         :model (nn.Module): PyTorch model
         :continuous_batching (bool): Whether this model will be used for continuous batching in future. If this is not set True here, the model cannot be exported/compiled for continuous batching later.
         :is_tlm (bool): Whether this is a Speculative Decoding Target Language Model. If set to True, `num_logits_to_keep` input array will have to be fed to control the number of returned logits during prefill/decode.
-        :enable_qnn (bool): Enables QNN Compilation path for the model.


     .. code-block:: python
@@ -1250,7 +1310,6 @@ def __init__(
         model: nn.Module,
         continuous_batching: bool = False,
         is_tlm: bool = False,
-        enable_qnn: bool = False,
         **kwargs,
     ):
         model_class_name = model.__class__.__name__
@@ -1282,8 +1341,6 @@ def __init__(
             self.model, transformed = SpDTransform.apply(self.model)
         self.is_tlm = is_tlm

-        self.enable_qnn = enable_qnn
-
     @property
     def model_name(self) -> str:
         mname = self.model.__class__.__name__
@@ -1292,18 +1349,12 @@ def model_name(self) -> str:
         return mname

     def __repr__(self) -> str:
-        return self.__class__.__name__ + "\n" + self.model.__repr__
+        return self.__class__.__name__ + "\n" + self.model.__repr__()

     @classmethod
     @with_replaced_quantizers
     def from_pretrained(
-        cls,
-        pretrained_model_name_or_path,
-        continuous_batching: bool = False,
-        is_tlm: bool = False,
-        enable_qnn: bool = False,
-        *args,
-        **kwargs,
+        cls, pretrained_model_name_or_path, continuous_batching: bool = False, is_tlm: bool = False, *args, **kwargs
     ):
         """
         This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM.
@@ -1314,7 +1365,6 @@ def from_pretrained(
             :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory.
             :continuous_batching (bool): Whether this model will be used for continuous batching in future. If this is not set True here, the model cannot be exported/compiled for continuous batching later.
             :is_tlm (bool): Whether this is a Speculative Decoding Target Language Model. If set to True, `num_logits_to_keep` input array will have to be fed to control the number of returned logits during prefill/decode.
-            :enable_qnn (bool): Enables QNN Compilation path for the model.
             :args, kwargs: Additional arguments to pass to transformers.AutoModelForCausalLM.

         .. code-block:: python
@@ -1348,7 +1398,6 @@ def from_pretrained(
         kv_offload = kwargs.pop("kv_offload", None)

         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)

         # This is to support models that should be classified into a different auto class, but transformers loads them via this class
@@ -1358,7 +1407,7 @@ def from_pretrained(
                 model, kv_offload=kv_offload
             )

-        return cls(model, is_tlm=is_tlm, continuous_batching=continuous_batching, enable_qnn=enable_qnn)
+        return cls(model, is_tlm=is_tlm, continuous_batching=continuous_batching)

     @property
     def model_hash(self) -> str:
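Callers that previously toggled QNN at load time no longer pass enable_qnn to from_pretrained. A minimal sketch of the adjusted flow, assuming the QNN switch is instead supplied at compile time (the flag placement and values here are illustrative, not confirmed by this diff):

    from QEfficient import QEFFAutoModelForCausalLM

    model = QEFFAutoModelForCausalLM.from_pretrained("gpt2", continuous_batching=False, is_tlm=False)
    model.compile(num_cores=16, enable_qnn=False)  # assumed compile-time option; no longer a from_pretrained argument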
@@ -1738,20 +1787,26 @@ def export(self, export_dir: Optional[str] = None) -> str:
         inputs = self.model.get_dummy_inputs()
         dynamic_axes = self.model.get_onnx_dynamic_axes()
         output_names = self.model.get_output_names()
-        self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
+        return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)

     def compile(
         self,
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
-        encoder_ctx_len: int = 1500,
-        decoder_ctx_len: int = 150,
-        feature_len: int = 3000,
+        prefill_seq_len: Optional[int] = 1,
+        encoder_ctx_len: Optional[int] = None,
+        ctx_len: int = 150,
+        full_batch_size: Optional[int] = None,
+        kv_cache_batch_size: Optional[int] = None,
         batch_size: int = 1,
         num_devices: int = 1,
         num_cores: int = 16,  # FIXME: Make this mandatory arg
         mxfp6_matmul: bool = False,
+        mxint8_kv_cache: bool = False,
+        num_speculative_tokens: Optional[int] = None,
+        enable_qnn: bool = False,
+        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         """
@@ -1762,19 +1817,41 @@ def compile(
         ``Optional`` Args:
             :onnx_path (str, optional): Path to pre-exported onnx model.
             :compile_dir (str, optional): Path for saving the qpc generated.
-            :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``.
+            :encoder_ctx_len (int, optional): The maximum length of context for encoder, based on the AutoProcessor output. ``Defaults to checking config, if None in config then 1500``
+            :ctx_len (int, optional): The maximum length of context to keep for decoding. ``Defaults to 150``.
             :batch_size (int, optional): Batch size. ``Defaults to 1``.
             :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1.
             :num_cores (int): Number of cores used to compile the model.
             :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
-            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
+
+            Other args are not yet implemented for AutoModelForSpeechSeq2Seq
         Returns:
             :str: Path of the compiled ``qpc`` package.
         """
-        specializations = self.model.get_specializations(batch_size, encoder_ctx_len, decoder_ctx_len, feature_len)
+        specializations, compiler_options = self.model.get_specializations(
+            batch_size,
+            encoder_ctx_len,
+            ctx_len,
+            **compiler_options,
+        )

-        self._compile(
+        if full_batch_size:
+            logger.warning("Continuous batching is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if kv_cache_batch_size:
+            logger.warning("Prefix caching is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if mxint8_kv_cache:
+            logger.warning("mxint8 cache is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if num_speculative_tokens:
+            logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if enable_qnn or qnn_config:
+            logger.warning("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        return self._compile(
             onnx_path,
             compile_dir,
             compile_only=True,
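A short illustrative call under the new speech-to-text compile signature; the checkpoint name and values below are assumptions for the sketch, not part of this commit:

    from QEfficient import QEFFAutoModelForSpeechSeq2Seq

    model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny")
    qpc_path = model.compile(
        encoder_ctx_len=1500,  # if omitted, taken from the model config, else 1500
        ctx_len=150,           # maximum decode-side context
        num_cores=16,
        num_devices=1,
        mxfp6_matmul=False,
    )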
@@ -1792,7 +1869,6 @@ def generate(
         inputs: torch.Tensor,
         generation_len: int,
         streamer: Optional[TextStreamer] = None,
-        enable_debug_logs: bool = False,
         device_ids: List[int] = None,
     ) -> Union[torch.Tensor, np.ndarray]:
         """
@@ -1801,9 +1877,8 @@ def generate(
 
         ``Mandatory`` Args:
             :processor: autoprocessor to process inputs and decode logits
-            :inputs (np.ndarray): inputs to run the execution.
+            :inputs (torch.Tensor): inputs to run the execution.
             :generation_len (int): length up to which to generate
-            :sample_rate (int): sampling rate at which input audio is stored in inputs (needed for processor)
             :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         Returns:
             :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
@@ -1814,9 +1889,20 @@ def generate(
         inputs = self.auto_correct_inputs(inputs)

         if self.qpc_session is None:
-            self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids, enable_debug_logs=enable_debug_logs)
+            self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids)
             self.batch_size = self.qpc_session.bindings[0].dims[0]

+        inputs["input_features"] = inputs["input_features"].numpy().astype(np.float32)
+
+        # add start token id and initial position ids to inputs
+        seq_len = 1
+        inputs["decoder_input_ids"] = (
+            torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.model.config.decoder_start_token_id
+        ).numpy()
+        inputs["decoder_position_ids"] = (
+            torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(self.batch_size, 1).numpy()
+        )
+
         self.qpc_session.skip_buffers(
             [x for x in self.qpc_session.input_names + self.qpc_session.output_names if x.startswith("past_")]
         )
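Since generate() now builds decoder_input_ids and decoder_position_ids internally and accepts torch tensors, a caller only supplies the processor output. Continuing the compile sketch above; the dataset and names here are illustrative assumptions:

    from datasets import load_dataset
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("openai/whisper-tiny")
    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    sample = ds[0]["audio"]

    # `model` is the compiled QEFFAutoModelForSpeechSeq2Seq instance from the previous sketch.
    # sample_rate and enable_debug_logs are no longer generate() arguments.
    inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")
    output = model.generate(inputs=inputs, generation_len=25)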
