tests/quantization/gptq/test_gptq.py: 21 changes (16 additions, 5 deletions)
@@ -94,6 +94,7 @@ class GPTQTest(unittest.TestCase):
EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the")

# this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings
EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
@@ -260,7 +261,9 @@ def test_serialization(self):
if self.device_map == "cpu":
quant_type = "ipex" if is_ipex_available() else "torch"
else:
quant_type = "exllama"
# We expecte tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
# TODO: Remove this once GPTQModel exllama kernels supports packing
quant_type = "tritonv2"
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname, device_map=self.device_map
)
@@ -424,10 +427,18 @@ def setUpClass(cls):
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

     def test_quantized_layers_type(self):
-        self.assertEqual(
-            self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
-            "exllama" if is_gptqmodel_available() else "exllamav2",
-        )
+        if is_auto_gptq_available() and not is_gptqmodel_available():
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "exllamav2",
+            )
+        else:
+            # We expect tritonv2 to be used here, because the exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
+            # TODO: Remove this once the GPTQModel exllama kernels support packing
+            self.assertEqual(
+                self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+                "tritonv2",
+            )

     def check_inference_correctness(self, model):
         """
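
For readers following along, the behavior these assertions pin down can be reproduced outside the test suite. The snippet below is a minimal sketch and not part of the PR: it assumes optimum plus either gptqmodel or auto-gptq is installed and a GPU is available, uses facebook/opt-125m purely as a placeholder checkpoint, and the layer path (model.decoder.layers[0].self_attn.k_proj) is specific to OPT. The printed QUANT_TYPE depends on the installed backend; per the comments above, GPTQModel packs with tritonv2 because its exllama kernels don't support packing.

import tempfile

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder: any GPTQ-compatible causal LM works

# Quantize to 4 bits on the fly (needs optimum + gptqmodel or auto-gptq).
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)
quantized = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=gptq_config
)

# Round-trip through save_pretrained/from_pretrained, mirroring test_serialization.
with tempfile.TemporaryDirectory() as tmpdirname:
    quantized.save_pretrained(tmpdirname)
    reloaded = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")

    # Inspect which kernel backs a quantized projection, mirroring test_quantized_layers_type.
    k_proj = reloaded.model.decoder.layers[0].self_attn.k_proj
    print(getattr(k_proj, "QUANT_TYPE", None))  # e.g. "tritonv2" with GPTQModel installed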