Exllama kernels support for AWQ models #28634

Merged: 16 commits, Mar 5, 2024
Changes from 10 commits
2 changes: 1 addition & 1 deletion docker/transformers-all-latest-gpu/Dockerfile
@@ -56,7 +56,7 @@ RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://hu
RUN python3 -m pip install --no-cache-dir einops

# Add autoawq for quantization testing
-RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp38-cp38-linux_x86_64.whl
+RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.9/autoawq-0.1.9+cu118-cp38-cp38-linux_x86_64.whl

# For bettertransformer + gptq
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
12 changes: 10 additions & 2 deletions src/transformers/integrations/__init__.py
@@ -17,7 +17,11 @@


_import_structure = {
-    "awq": ["fuse_awq_modules", "replace_with_awq_linear"],
+    "awq": [
+        "fuse_awq_modules",
+        "post_init_awq_exllama_modules",
+        "replace_with_awq_linear",
+    ],
    "bitsandbytes": [
        "get_keys_to_not_convert",
        "replace_8bit_linear",
@@ -80,7 +84,11 @@
}

if TYPE_CHECKING:
-    from .awq import fuse_awq_modules, replace_with_awq_linear
+    from .awq import (
+        fuse_awq_modules,
+        post_init_awq_exllama_modules,
+        replace_with_awq_linear,
+    )
    from .bitsandbytes import (
        get_keys_to_not_convert,
        replace_8bit_linear,
41 changes: 35 additions & 6 deletions src/transformers/integrations/awq.py
@@ -15,7 +15,11 @@
from ..activations import ACT2FN
from ..modeling_utils import PreTrainedModel
from ..utils import is_auto_awq_available, is_torch_available
-from ..utils.quantization_config import AwqBackendPackingMethod, AwqConfig, AWQLinearVersion
+from ..utils.quantization_config import (
+    AwqBackendPackingMethod,
+    AwqConfig,
+    AWQLinearVersion,
+)


if is_torch_available():
@@ -91,13 +95,23 @@ def replace_with_awq_linear(
        )

    if backend == AwqBackendPackingMethod.AUTOAWQ:
-        from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
-    elif backend == AwqBackendPackingMethod.LLMAWQ:
-        from awq.quantize.qmodule import WQLinear
+        if quantization_config.version == AWQLinearVersion.GEMM:
+            from awq.modules.linear.gemm import WQLinear_GEMM

-    if backend == AwqBackendPackingMethod.AUTOAWQ:
-        target_cls = WQLinear_GEMM if quantization_config.version == AWQLinearVersion.GEMM else WQLinear_GEMV
+            target_cls = WQLinear_GEMM
+        elif quantization_config.version == AWQLinearVersion.GEMV:
+            from awq.modules.linear.gemv import WQLinear_GEMV
+
+            target_cls = WQLinear_GEMV
+        elif quantization_config.version == AWQLinearVersion.EXLLAMA:
+            from awq.modules.linear.exllamav2 import WQLinear_ExllamaV2
+
+            target_cls = WQLinear_ExllamaV2
+        else:
+            raise ValueError(f"Unsupported AWQLinearVersion: {quantization_config.version}")
    else:
+        from awq.quantize.qmodule import WQLinear

        target_cls = WQLinear

    for name, module in model.named_children():
@@ -372,3 +386,18 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na
            setattr(parent, child_name, fused_attention_layer.to(previous_device))

        del q_proj, k_proj, v_proj, o_proj


+def post_init_awq_exllama_modules(model):
+    """
+    Runs post init for Exllama layers which performs:
+        - Weights unpacking, reordering and repacking
+        - Devices scratch space allocation
+    """
+    from awq.modules.linear.exllamav2 import exllamav2_post_init
+
+    # default values for exllamav2 from
+    # https://github.com/AutoGPTQ/AutoGPTQ/blob/6ba14f17ef73c161c2c4707cbf0b41e569a9c6dd/auto_gptq/nn_modules/qlinear/qlinear_exllamav2.py#L171
+    model = exllamav2_post_init(model, max_input_len=2048, max_batch_size=8)
Contributor: Couldn't we make max_input_len configurable through AwqConfig? Wdyt?

Member Author: Marc suggested we leave it as is for now: #28634 (comment)

Contributor: Oh okay! I think it would make sense to directly expose an exllama_config - wdyt @SunMarc?

Member: Yes, it would make more sense to expose it in an exllama_config!

Member Author: I guess in another PR, right?

Contributor: Hmmm, I think it would be better to add it now and not leave the main branch with hardcoded config values. It shouldn't be super complex - you can just copy over the existing logic from GptqConfig, right?


+    return model
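A minimal sketch of the direction discussed in the thread above - letting AwqConfig carry an optional exllama_config dict and forwarding it to this post-init step, similar to the exllama settings GPTQConfig already exposes. This is not part of the diff: the exllama_config name and its keys are assumptions for illustration only, and the defaults simply mirror the hardcoded values above.

def post_init_awq_exllama_modules(model, exllama_config=None):
    # Sketch only (not in this PR): read the exllamav2 scratch-space settings from an
    # assumed `exllama_config` dict instead of hardcoding them. The underlying call,
    # exllamav2_post_init(model, max_input_len, max_batch_size), is taken from the diff.
    from awq.modules.linear.exllamav2 import exllamav2_post_init

    exllama_config = exllama_config or {}
    return exllamav2_post_init(
        model,
        max_input_len=exllama_config.get("max_input_len", 2048),
        max_batch_size=exllama_config.get("max_batch_size", 8),
    )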
11 changes: 11 additions & 0 deletions src/transformers/quantizers/quantizer_awq.py
@@ -20,6 +20,7 @@
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_auto_awq_available, is_torch_available, logging
+from ..utils.quantization_config import AWQLinearVersion


if is_torch_available():
@@ -95,12 +96,22 @@ def _process_model_after_weight_loading(self, model):
            model = fuse_awq_modules(model, self.quantization_config)
            model._awq_is_fused = True  # TODO: consider storing this flag in model.config instead

+        if self.quantization_config.version == AWQLinearVersion.EXLLAMA:
+            from ..integrations import post_init_awq_exllama_modules
+
+            model = post_init_awq_exllama_modules(model)

    @property
    def is_serializable(self):
        # AWQ through auto-awq has been always serializable, except if the model is fused.
        if self.quantization_config.do_fuse:
            logger.warning("You cannot save an AWQ model that uses fused modules!")
            return False

+        if self.quantization_config.version == AWQLinearVersion.EXLLAMA:
+            logger.warning("You cannot save an AWQ model that uses Exllama backend!")
+            return False

        return True

    @property
23 changes: 20 additions & 3 deletions src/transformers/utils/quantization_config.py
@@ -43,6 +43,7 @@ class QuantizationMethod(str, Enum):
class AWQLinearVersion(str, Enum):
    GEMM = "gemm"
    GEMV = "gemv"
+    EXLLAMA = "exllama"

    @staticmethod
    def from_str(version: str):
@@ -51,6 +52,8 @@ def from_str(version: str):
            return AWQLinearVersion.GEMM
        elif version == "gemv":
            return AWQLinearVersion.GEMV
+        elif version == "exllama":
+            return AWQLinearVersion.EXLLAMA
        else:
            raise ValueError(f"Unknown AWQLinearVersion {version}")

@@ -669,9 +672,9 @@ def post_init(self):
            )

        self.version = AWQLinearVersion.from_str(self.version)
-        if self.version not in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV]:
+        if self.version not in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA]:
            raise ValueError(
-                f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV] - not recognized version {self.version}"
+                f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA] - not recognized version {self.version}"
            )

        if self.backend == AwqBackendPackingMethod.LLMAWQ:
@@ -726,8 +729,22 @@ def post_init(self):
                    f"Required fields are missing in the fusing mapping, required fields are {required_keys}"
                )

+        if self.version == AWQLinearVersion.EXLLAMA:
+            awq_version_supports_exllama = False
+            MIN_AWQ_VERSION = "0.1.9"
+            if is_auto_awq_available():
+                awq_version_supports_exllama = version.parse(importlib.metadata.version("autoawq")) >= version.parse(
+                    MIN_AWQ_VERSION
+                )
+
+            if not awq_version_supports_exllama:
+                raise ValueError(
+                    f"You current version of `autoawq` does not support exllama backend, "
+                    f"please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}."
+                )

    def get_loading_attributes(self):
        attibutes_dict = copy.deepcopy(self.__dict__)
-        loading_attibutes = ["do_fuse", "modules_to_fuse", "fuse_max_seq_len"]
+        loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len"]
        loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes}
        return loading_attibutes_dict
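Putting the config changes together: the exllama kernels are opted into purely through AwqConfig, and the chosen version now travels with the loading attributes. A small usage sketch, assuming autoawq >= 0.1.9 is installed and an AWQ checkpoint is available; the model id below is a placeholder, not taken from this diff.

from transformers import AutoModelForCausalLM, AwqConfig

# "exllama" is normalized to AWQLinearVersion.EXLLAMA by AWQLinearVersion.from_str in post_init
quantization_config = AwqConfig(version="exllama")

# "version" is now included in get_loading_attributes(), alongside do_fuse,
# modules_to_fuse and fuse_max_seq_len, so it reaches the quantizer at load time
print(quantization_config.get_loading_attributes())

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-v0.1-AWQ",  # placeholder AWQ checkpoint
    quantization_config=quantization_config,
)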
14 changes: 14 additions & 0 deletions tests/quantization/autoawq/test_awq.py
@@ -192,6 +192,20 @@ def test_quantized_model_bf16(self):
        output = quantized_model.generate(**input_ids, max_new_tokens=40)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_BF16)

+    def test_quantized_model_exllama(self):
+        """
+        Simple test that checks if the quantized model is working properly with exllama backend
+        """
+        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
+
+        quantization_config = AwqConfig(version="exllama")
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name, quantization_config=quantization_config
+        ).to(torch_device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=40)
+        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+
    def test_quantized_model_no_device_map(self):
        """
        Simple test that checks if the quantized model is working properly