From 76bc0a8a97a09d28a3429a0d397d8ffe9751672e Mon Sep 17 00:00:00 2001
From: kq-chen
Date: Thu, 14 Nov 2024 14:15:56 +0800
Subject: [PATCH] add qwen2vl support (#599)

Co-authored-by: Casper
---
 awq/models/__init__.py     |   1 +
 awq/models/auto.py         |   1 +
 awq/models/base.py         |  17 +-
 awq/models/qwen2vl.py      |  75 ++++++++
 awq/utils/qwen_vl_utils.py | 339 +++++++++++++++++++++++++++++++++++++
 docs/examples.md           | 199 ++++++++++++++++++++++
 6 files changed, 627 insertions(+), 5 deletions(-)
 create mode 100644 awq/models/qwen2vl.py
 create mode 100644 awq/utils/qwen_vl_utils.py

diff --git a/awq/models/__init__.py b/awq/models/__init__.py
index 2f1a88e2..9b3a4f27 100644
--- a/awq/models/__init__.py
+++ b/awq/models/__init__.py
@@ -24,3 +24,4 @@
 from .deepseek_v2 import DeepseekV2AWQForCausalLM
 from .minicpm import MiniCPMAWQForCausalLM
 from .internlm2 import InternLM2AWQForCausalLM
+from .qwen2vl import Qwen2VLAWQForCausalLM
\ No newline at end of file
diff --git a/awq/models/auto.py b/awq/models/auto.py
index 1ce1b21d..495722ab 100644
--- a/awq/models/auto.py
+++ b/awq/models/auto.py
@@ -34,6 +34,7 @@
     "deepseek_v2": DeepseekV2AWQForCausalLM,
     "minicpm": MiniCPMAWQForCausalLM,
     "internlm2": InternLM2AWQForCausalLM,
+    "qwen2_vl": Qwen2VLAWQForCausalLM,
 }
 
 
diff --git a/awq/models/base.py b/awq/models/base.py
index 2da5095d..abfb9b38 100644
--- a/awq/models/base.py
+++ b/awq/models/base.py
@@ -39,7 +39,7 @@
     PreTrainedModel,
     PretrainedConfig,
     AutoProcessor,
-    CLIPImageProcessor,
+    BaseImageProcessor,
     PreTrainedTokenizer,
 )
 from accelerate.big_modeling import (
@@ -74,6 +74,7 @@
     "baichuan": "AutoModelForCausalLM",
     "llava": "AutoModelForVision2Seq",
     "qwen2": "AutoModelForCausalLM",
+    "qwen2_vl": "AutoModelForVision2Seq",
     "gemma": "AutoModelForCausalLM",
     "gemma2": "AutoModelForCausalLM",
     "stablelm": "AutoModelForCausalLM",
@@ -84,6 +85,7 @@
     "deepseek_v2": "AutoModelForCausalLM",
     "minicpm": "AutoModelForCausalLM",
     "internlm2": "AutoModelForCausalLM",
+    "qwen2_vl": "AutoModelForVision2Seq",
 }
 
 
@@ -100,7 +102,7 @@ def __init__(
             AwqConfig, Doc("The quantization config of the model.")
         ],
         processor: Annotated[
-            AutoProcessor, Doc("An optional processor, e.g. for vision models.")
+            BaseImageProcessor, Doc("An optional processor, e.g. for vision models.")
         ],
     ):
         """The base model for all AutoAWQ models."""
@@ -111,7 +113,7 @@ def __init__(
         self.search_result = None
         self.config: PretrainedConfig = config
         self.quant_config: AwqConfig = quant_config
-        self.processor: CLIPImageProcessor = processor
+        self.processor: BaseImageProcessor = processor
 
     def to(self, device: Annotated[str, Doc("The device to move your model to.")]):
         """A utility function for moving the model to a device."""
@@ -186,6 +188,11 @@ def quantize(
         ] = 1024 * 1024 * 1024,
+        quantizer_cls: Annotated[
+            AwqQuantizer,
+            Doc("If you want to customize the quantization class, you can use AwqQuantizer as a base class.")
+        ] = AwqQuantizer,
+        **kwargs,
     ):
         """
         The main quantization function that you can use to quantize your model.
@@ -209,7 +216,7 @@ def quantize(
         if hasattr(self, "modules_to_not_convert"):
             self.quant_config.modules_to_not_convert = self.modules_to_not_convert
 
-        self.quantizer = AwqQuantizer(
+        self.quantizer = quantizer_cls(
             self,
             self.model,
             tokenizer,
@@ -228,6 +235,7 @@
             max_calib_samples=max_calib_samples,
             max_calib_seq_len=max_calib_seq_len,
             max_chunk_memory=max_chunk_memory,
+            **kwargs,
         )
         self.quantizer.quantize()
 
@@ -373,7 +381,6 @@ def from_pretrained(
         processor = None
         if target_cls_name == "AutoModelForVision2Seq":
            processor = AutoProcessor.from_pretrained(model_weights_path)
-            processor: CLIPImageProcessor = processor.image_processor
 
         # If not quantized, must load with AutoModelForCausalLM
         model = target_cls.from_pretrained(
diff --git a/awq/models/qwen2vl.py b/awq/models/qwen2vl.py
new file mode 100644
index 00000000..4d8ad213
--- /dev/null
+++ b/awq/models/qwen2vl.py
@@ -0,0 +1,75 @@
+from .base import BaseAWQForCausalLM
+from typing_extensions import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from transformers import Qwen2VLForConditionalGeneration
+    from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLDecoderLayer
+
+class Qwen2VLAWQForCausalLM(BaseAWQForCausalLM):
+    layer_type = "Qwen2VLDecoderLayer"
+    max_seq_len_key = "max_position_embeddings"
+    modules_to_not_convert = ["visual"]
+
+    @staticmethod
+    def get_model_layers(model: "Qwen2VLForConditionalGeneration"):
+        return model.model.layers
+
+    @staticmethod
+    def get_act_for_scaling(module: "Qwen2VLForConditionalGeneration"):
+        return dict(is_scalable=False)
+
+    @staticmethod
+    def move_embed(model: "Qwen2VLForConditionalGeneration", device: str):
+        model.model.embed_tokens = model.model.embed_tokens.to(device)
+        model.visual = model.visual.to(device)
+
+    @staticmethod
+    def get_layers_for_scaling(module: "Qwen2VLDecoderLayer", input_feat, module_kwargs):
+        layers = []
+
+        # attention input
+        layers.append(
+            dict(
+                prev_op=module.input_layernorm,
+                layers=[
+                    module.self_attn.q_proj,
+                    module.self_attn.k_proj,
+                    module.self_attn.v_proj,
+                ],
+                inp=input_feat["self_attn.q_proj"],
+                module2inspect=module.self_attn,
+                kwargs=module_kwargs,
+            )
+        )
+
+        # attention out
+        # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
+        if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
+            layers.append(
+                dict(
+                    prev_op=module.self_attn.v_proj,
+                    layers=[module.self_attn.o_proj],
+                    inp=input_feat["self_attn.o_proj"],
+                )
+            )
+
+        # linear 1
+        layers.append(
+            dict(
+                prev_op=module.post_attention_layernorm,
+                layers=[module.mlp.gate_proj, module.mlp.up_proj],
+                inp=input_feat["mlp.gate_proj"],
+                module2inspect=module.mlp,
+            )
+        )
+
+        # linear 2
+        layers.append(
+            dict(
+                prev_op=module.mlp.up_proj,
+                layers=[module.mlp.down_proj],
+                inp=input_feat["mlp.down_proj"],
+            )
+        )
+
+        return layers
\ No newline at end of file
diff --git a/awq/utils/qwen_vl_utils.py b/awq/utils/qwen_vl_utils.py
new file mode 100644
index 00000000..08ba02f7
--- /dev/null
+++ b/awq/utils/qwen_vl_utils.py
@@ -0,0 +1,339 @@
+from __future__ import annotations
+
+import base64
+import logging
+import math
+import os
+import sys
+import time
+import warnings
+from functools import lru_cache
+from io import BytesIO
+
+import requests
+import torch
+import torchvision
+from packaging import version
+from PIL import Image
+from torchvision import io, transforms
+from torchvision.transforms import InterpolationMode
+
+
+logger = logging.getLogger(__name__)
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+
+VIDEO_MIN_PIXELS = 128 * 28 * 28
+VIDEO_MAX_PIXELS = 768 * 28 * 28
+VIDEO_TOTAL_PIXELS = 24576 * 28 * 28
+FRAME_FACTOR = 2
+FPS = 2.0
+FPS_MIN_FRAMES = 4
+FPS_MAX_FRAMES = 768
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+def smart_resize(
+    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    if "image" in ele:
+        image = ele["image"]
+    else:
+        image = ele["image_url"]
+    image_obj = None
+    if isinstance(image, Image.Image):
+        image_obj = image
+    elif image.startswith("http://") or image.startswith("https://"):
+        image_obj = Image.open(requests.get(image, stream=True).raw)
+    elif image.startswith("file://"):
+        image_obj = Image.open(image[7:])
+    elif image.startswith("data:image"):
+        if "base64," in image:
+            _, base64_data = image.split("base64,", 1)
+            data = base64.b64decode(base64_data)
+            image_obj = Image.open(BytesIO(data))
+    else:
+        image_obj = Image.open(image)
+    if image_obj is None:
+        raise ValueError(f"Unrecognized image input, supported types are local path, http url, base64 and PIL.Image, got {image}")
+    image = image_obj.convert("RGB")
+    ## resize
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=size_factor,
+        )
+    else:
+        width, height = image.size
+        min_pixels = ele.get("min_pixels", MIN_PIXELS)
+        max_pixels = ele.get("max_pixels", MAX_PIXELS)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=size_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    image = image.resize((resized_width, resized_height))
+
+    return image
+
+
+def smart_nframes(
+    ele: dict,
+    total_frames: int,
+    video_fps: int | float,
+) -> int:
+    """calculate the number of frames for video used for model inputs.
+
+    Args:
+        ele (dict): a dict that contains the configuration of the video.
+            supports either `fps` or `nframes`:
+                - nframes: the number of frames to extract for model inputs.
+                - fps: the fps to extract frames for model inputs.
+                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
+                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
+        total_frames (int): the original total number of frames of the video.
+        video_fps (int | float): the original fps of the video.
+
+    Raises:
+        ValueError: nframes should be in the interval [FRAME_FACTOR, total_frames].
+
+    Returns:
+        int: the number of frames for video used for model inputs.
+    """
+    assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
+    if "nframes" in ele:
+        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+    else:
+        fps = ele.get("fps", FPS)
+        min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+        max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
+        nframes = total_frames / video_fps * fps
+        nframes = min(max(nframes, min_frames), max_frames)
+        nframes = round_by_factor(nframes, FRAME_FACTOR)
+    if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+        raise ValueError(f"nframes should be in the interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
+    return nframes
+
+
+def _read_video_torchvision(
+    ele: dict,
+) -> torch.Tensor:
+    """read video using torchvision.io.read_video
+
+    Args:
+        ele (dict): a dict that contains the configuration of the video.
+        supported keys:
+            - video: the path of the video. Supports "file://", "http://", "https://" and local paths.
+            - video_start: the start time of the video.
+            - video_end: the end time of the video.
+    Returns:
+        torch.Tensor: the video tensor with shape (T, C, H, W).
+    """
+    video_path = ele["video"]
+    if version.parse(torchvision.__version__) < version.parse("0.19.0"):
+        if "http://" in video_path or "https://" in video_path:
+            warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
+        if "file://" in video_path:
+            video_path = video_path[7:]
+    st = time.time()
+    video, audio, info = io.read_video(
+        video_path,
+        start_pts=ele.get("video_start", 0.0),
+        end_pts=ele.get("video_end", None),
+        pts_unit="sec",
+        output_format="TCHW",
+    )
+    total_frames, video_fps = video.size(0), info["video_fps"]
+    logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long()
+    video = video[idx]
+    return video
+
+
+def is_decord_available() -> bool:
+    import importlib.util
+
+    return importlib.util.find_spec("decord") is not None
+
+
+def _read_video_decord(
+    ele: dict,
+) -> torch.Tensor:
+    """read video using decord.VideoReader
+
+    Args:
+        ele (dict): a dict that contains the configuration of the video.
+        supported keys:
+            - video: the path of the video. Supports "file://", "http://", "https://" and local paths.
+            - video_start: the start time of the video.
+            - video_end: the end time of the video.
+    Returns:
+        torch.Tensor: the video tensor with shape (T, C, H, W).
+    """
+    import decord
+    video_path = ele["video"]
+    st = time.time()
+    vr = decord.VideoReader(video_path)
+    # TODO: support start_pts and end_pts
+    if 'video_start' in ele or 'video_end' in ele:
+        raise NotImplementedError("start_pts and end_pts are not supported with the decord backend for now.")
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
+    video = vr.get_batch(idx).asnumpy()
+    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+    return video
+
+
+VIDEO_READER_BACKENDS = {
+    "decord": _read_video_decord,
+    "torchvision": _read_video_torchvision,
+}
+
+FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None)
+
+
+@lru_cache(maxsize=1)
+def get_video_reader_backend() -> str:
+    if FORCE_QWENVL_VIDEO_READER is not None:
+        video_reader_backend = FORCE_QWENVL_VIDEO_READER
+    elif is_decord_available():
+        video_reader_backend = "decord"
+    else:
+        video_reader_backend = "torchvision"
+    print(f"qwen-vl-utils using {video_reader_backend} to read video.", file=sys.stderr)
+    return video_reader_backend
+
+
+def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR) -> torch.Tensor | list[Image.Image]:
+    if isinstance(ele["video"], str):
+        video_reader_backend = get_video_reader_backend()
+        video = VIDEO_READER_BACKENDS[video_reader_backend](ele)
+        nframes, _, height, width = video.shape
+
+        min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+        total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+        max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
+        max_pixels = ele.get("max_pixels", max_pixels)
+        if "resized_height" in ele and "resized_width" in ele:
+            resized_height, resized_width = smart_resize(
+                ele["resized_height"],
+                ele["resized_width"],
+                factor=image_factor,
+            )
+        else:
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=image_factor,
+                min_pixels=min_pixels,
+                max_pixels=max_pixels,
+            )
+        video = transforms.functional.resize(
+            video,
+            [resized_height, resized_width],
+            interpolation=InterpolationMode.BICUBIC,
+            antialias=True,
+        ).float()
+        return video
+    else:
+        assert isinstance(ele["video"], (list, tuple))
+        process_info = ele.copy()
+        process_info.pop("type", None)
+        process_info.pop("video", None)
+        images = [
+            fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
+            for video_element in ele["video"]
+        ]
+        nframes = ceil_by_factor(len(images), FRAME_FACTOR)
+        if len(images) < nframes:
+            images.extend([images[-1]] * (nframes - len(images)))
+        return images
+
+
+def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
+    vision_infos = []
+    if isinstance(conversations[0], dict):
+        conversations = [conversations]
+    for conversation in conversations:
+        for message in conversation:
+            if isinstance(message["content"], list):
+                for ele in message["content"]:
+                    if (
+                        "image" in ele
+                        or "image_url" in ele
+                        or "video" in ele
+                        or ele["type"] in ("image", "image_url", "video")
+                    ):
+                        vision_infos.append(ele)
+    return vision_infos
+
+
+def process_vision_info(
+    conversations: list[dict] | list[list[dict]],
+) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None]:
+    vision_infos = extract_vision_info(conversations)
+    ## Read images or videos
+    image_inputs = []
+    video_inputs = []
+    for vision_info in vision_infos:
+        if "image" in vision_info or "image_url" in vision_info:
+            image_inputs.append(fetch_image(vision_info))
+        elif "video" in vision_info:
+            video_inputs.append(fetch_video(vision_info))
+        else:
+            raise ValueError("image, image_url or video should be in content.")
+    if len(image_inputs) == 0:
+        image_inputs = None
+    if len(video_inputs) == 0:
+        video_inputs = None
+    return image_inputs, video_inputs
\ No newline at end of file
diff --git a/docs/examples.md b/docs/examples.md
index 2fc0259c..6032b212 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -274,6 +274,156 @@ subprocess.run([
 ], shell=True, check=True)
 ```
 
+### Custom Quantizer (Qwen2 VL Example)
+
+Below, the Qwen team provides an example of a custom quantizer that can be used to
+effectively quantize the Qwen2 VL model using multimodal calibration samples.
+
+```python
+import torch
+import torch.nn as nn
+
+from awq import AutoAWQForCausalLM
+from awq.utils.qwen_vl_utils import process_vision_info
+from awq.quantize.quantizer import AwqQuantizer, clear_memory, get_best_device
+
+# Specify paths and hyperparameters for quantization
+model_path = "Qwen/Qwen2-VL-7B-Instruct"
+quant_path = "qwen2-vl-7b-instruct"
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
+
+model = AutoAWQForCausalLM.from_pretrained(
+    model_path, use_cache=False, attn_implementation="flash_attention_2"
+)
+
+# We define our own quantizer by extending the AwqQuantizer.
+# The main difference is in how the samples are processed when
+# the quantization process is initialized.
+class Qwen2VLAwqQuantizer(AwqQuantizer):
+    def init_quant(self, n_samples=None, max_seq_len=None):
+        modules = self.awq_model.get_model_layers(self.model)
+        samples = self.calib_data
+
+        inps = []
+        layer_kwargs = {}
+
+        best_device = get_best_device()
+        modules[0] = modules[0].to(best_device)
+        self.awq_model.move_embed(self.model, best_device)
+
+        # get input and kwargs to layer 0
+        # with_kwargs is only supported in PyTorch 2.0
+        # use this Catcher hack for now
+        class Catcher(nn.Module):
+            def __init__(self, module):
+                super().__init__()
+                self.module = module
+
+            def forward(self, *args, **kwargs):
+                # assume first input to forward is hidden states
+                if len(args) > 0:
+                    hidden_states = args[0]
+                    del args
+                else:
+                    first_key = list(kwargs.keys())[0]
+                    hidden_states = kwargs.pop(first_key)
+
+                inps.append(hidden_states)
+                layer_kwargs.update(kwargs)
+                raise ValueError  # early exit to break later inference
+
+        def move_to_device(obj: torch.Tensor | nn.Module, device: torch.device):
+            def get_device(obj: torch.Tensor | nn.Module):
+                if isinstance(obj, torch.Tensor):
+                    return obj.device
+                return next(obj.parameters()).device
+
+            if get_device(obj) != device:
+                obj = obj.to(device)
+            return obj
+
+        # patch layer 0 to catch input and kwargs
+        modules[0] = Catcher(modules[0])
+        for k, v in samples.items():
+            if isinstance(v, (torch.Tensor, nn.Module)):
+                samples[k] = move_to_device(v, best_device)
+        try:
+            self.model(**samples)
+        except ValueError:  # work with early exit
+            pass
+        finally:
+            for k, v in samples.items():
+                if isinstance(v, (torch.Tensor, nn.Module)):
+                    samples[k] = move_to_device(v, "cpu")
+        modules[0] = modules[0].module  # restore
+
+        del samples
+        inps = inps[0]
+
+        modules[0] = modules[0].cpu()
+        self.awq_model.move_embed(self.model, "cpu")
+
+        clear_memory()
+
+        return modules, layer_kwargs, inps
+
+# Then you need to prepare your data for calibration. What you need to do is just put the samples into a list,
+# each of which is a typical chat message as shown below. You can specify text and image in the `content` field:
+# dataset = [
+#     # message 0
+#     [
+#         {"role": "system", "content": "You are a helpful assistant."},
+#         {"role": "user", "content": "Tell me who you are."},
+#         {"role": "assistant", "content": "I am a large language model named Qwen..."},
+#     ],
+#     # message 1
+#     [
+#         {
+#             "role": "user",
+#             "content": [
+#                 {"type": "image", "image": "file:///path/to/your/image.jpg"},
+#                 {"type": "text", "text": "Output all text in the image"},
+#             ],
+#         },
+#         {"role": "assistant", "content": "The text in the image is balabala..."},
+#     ],
+#     # other messages...
+#     ...,
+# ]
+# Here, we use a caption dataset **only for demonstration**. You should replace it with your own SFT dataset.
+def prepare_dataset(n_sample: int = 8) -> list[list[dict]]:
+    from datasets import load_dataset
+
+    dataset = load_dataset("laion/220k-GPT4Vision-captions-from-LIVIS", split=f"train[:{n_sample}]")
+    return [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": sample["url"]},
+                    {"type": "text", "text": "generate a caption for this image"},
+                ],
+            },
+            {"role": "assistant", "content": sample["caption"]},
+        ]
+        for sample in dataset
+    ]
+
+dataset = prepare_dataset()
+
+# process the dataset into tensors
+text = model.processor.apply_chat_template(dataset, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(dataset)
+inputs = model.processor(text=text, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Then just run the calibration process with one line of code
+model.quantize(calib_data=inputs, quant_config=quant_config, quantizer_cls=Qwen2VLAwqQuantizer)
+
+# Save the model
+model.model.config.use_cache = model.model.generation_config.use_cache = True
+model.save_quantized(quant_path, safetensors=True, shard_size="4GB")
+```
+
 ## Basic Inference
 
 ### Inference With GPU
@@ -466,3 +616,52 @@ generation_output = model.generate(
     streamer=streamer
 )
 ```
+
+### Qwen2 VL
+
+Below is an example of how to run inference using Qwen2 VL.
+
+```python
+from awq import AutoAWQForCausalLM
+from awq.utils.qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor, TextStreamer
+
+# Load model
+quant_path = "Qwen/Qwen2-VL-7B-Instruct-AWQ"
+model = AutoAWQForCausalLM.from_quantized(quant_path)
+processor = AutoProcessor.from_pretrained(quant_path)
+streamer = TextStreamer(processor, skip_prompt=True)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+            },
+            {"type": "text", "text": "Describe this image."},
+        ],
+    }
+]
+
+# Load inputs
+text = processor.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
+)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(
+    text=[text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pt",
+)
+inputs = inputs.to("cuda")
+
+generation_output = model.generate(
+    **inputs,
+    max_new_tokens=512,
+    streamer=streamer
+)
+```
\ No newline at end of file