tests/quantization/gptq/test_gptq.py: 21 changes (16 additions, 5 deletions)
@@ -94,6 +94,7 @@ class GPTQTest(unittest.TestCase):
EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the")

# this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings
EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
@@ -260,7 +261,9 @@ def test_serialization(self):
if self.device_map == "cpu":
quant_type = "ipex" if is_ipex_available() else "torch"
else:
quant_type = "exllama"
# We expecte tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
# TODO: Remove this once GPTQModel exllama kernels supports packing
quant_type = "tritonv2"
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname, device_map=self.device_map
)
@@ -424,10 +427,18 @@ def setUpClass(cls):
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

     def test_quantized_layers_type(self):
-        self.assertEqual(
-            self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
-            "exllama" if is_gptqmodel_available() else "exllamav2",
-        )
+        if is_auto_gptq_available() and not is_gptqmodel_available():
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "exllamav2",
+            )
+        else:
+            # We expect tritonv2 to be used here, because the exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
+            # TODO: Remove this once the GPTQModel exllama kernels support packing
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "tritonv2",
+            )

     def check_inference_correctness(self, model):
         """
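
For readers following along, the behavior these assertions pin down can be reproduced outside the test suite. The snippet below is a minimal sketch and not part of the PR: it assumes optimum plus either gptqmodel or auto-gptq is installed and a GPU is available, uses facebook/opt-125m purely as a placeholder checkpoint, and the layer path (model.decoder.layers[0].self_attn.k_proj) is specific to OPT. The printed QUANT_TYPE depends on the installed backend; per the comments above, GPTQModel packs with tritonv2 because its exllama kernels don't support packing.

import tempfile

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder: any GPTQ-compatible causal LM works

# Quantize to 4 bits on the fly (needs optimum + gptqmodel or auto-gptq).
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)
quantized = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=gptq_config
)

# Round-trip through save_pretrained/from_pretrained, mirroring test_serialization.
with tempfile.TemporaryDirectory() as tmpdirname:
    quantized.save_pretrained(tmpdirname)
    reloaded = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")

    # Inspect which kernel backs a quantized projection, mirroring test_quantized_layers_type.
    k_proj = reloaded.model.decoder.layers[0].self_attn.k_proj
    print(getattr(k_proj, "QUANT_TYPE", None))  # e.g. "tritonv2" with GPTQModel installed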