From bf8c6e83d0fc3493697eda31995b6ce2627c70f8 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 07:23:31 +0000 Subject: [PATCH 1/7] Add tokenicer.save() API --- gptqmodel/models/base.py | 2 ++ tests/test_tokenicer.py | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 1173b406c..b79eef416 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -977,6 +977,8 @@ def save( else: self.save_pretrained(save_dir, **kwargs) + self.tokenizer.save(save_dir) + def compile(self, backend="inductor", mode="max-autotune"): if not self.quantized: logger.warning("model is not quantized, skip compiling...") diff --git a/tests/test_tokenicer.py b/tests/test_tokenicer.py index 243359367..41bb89d49 100644 --- a/tests/test_tokenicer.py +++ b/tests/test_tokenicer.py @@ -17,11 +17,14 @@ import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLLE_DEVICES"] = "0" import unittest # noqa: E402 - +import tempfile from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 from parameterized import parameterized # noqa: E402 +from datasets import load_dataset +from tokenicer.const import VERIFY_JSON_FILE_NAME class TestTokenicer(unittest.TestCase): @@ -78,3 +81,17 @@ def test_tokenicer_decode(self): example, msg=f"Expected example='{self.example}' but got '{example}'." ) + + def test_tokenicer_save(self): + traindata = load_dataset("json", data_files="/monster/data/model/dataset/c4-train.00000-of-01024.json.gz", + split="train") + calibration_dataset = [self.tokenizer(example["text"]) for example in traindata.select(range(32))] + + self.model.quantize(calibration_dataset, batch_size=32) + + with tempfile.TemporaryDirectory() as tmpdir: + self.model.save(tmpdir) + verify_json_path = os.path.join(tmpdir, VERIFY_JSON_FILE_NAME) + + result = os.path.isfile(verify_json_path) + self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.") From 5062291507685f85b250b5772585cbe4e50cf7cb Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 07:37:52 +0000 Subject: [PATCH 2/7] code clean up --- tests/test_tokenicer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_tokenicer.py b/tests/test_tokenicer.py index 41bb89d49..7418addf8 100644 --- a/tests/test_tokenicer.py +++ b/tests/test_tokenicer.py @@ -17,7 +17,6 @@ import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -os.environ["CUDA_VISIBLLE_DEVICES"] = "0" import unittest # noqa: E402 import tempfile From b3c4c093d99b65de4a2012dc16179eac1a6cd9a8 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 08:59:26 +0000 Subject: [PATCH 3/7] update test --- tests/test_tokenicer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tokenicer.py b/tests/test_tokenicer.py index 7418addf8..1fcf468a4 100644 --- a/tests/test_tokenicer.py +++ b/tests/test_tokenicer.py @@ -23,7 +23,7 @@ from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 from parameterized import parameterized # noqa: E402 from datasets import load_dataset -from tokenicer.const import VERIFY_JSON_FILE_NAME +from tokenicer.const import VALIDATE_JSON_FILE_NAME class TestTokenicer(unittest.TestCase): @@ -90,7 +90,7 @@ def test_tokenicer_save(self): with tempfile.TemporaryDirectory() as tmpdir: self.model.save(tmpdir) - verify_json_path = os.path.join(tmpdir, VERIFY_JSON_FILE_NAME) + verify_json_path = os.path.join(tmpdir, VALIDATE_JSON_FILE_NAME) result = os.path.isfile(verify_json_path) self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.") From 39c72e09f9645ab0315e8b99494384f77c5d9c48 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 09:11:57 +0000 Subject: [PATCH 4/7] code review --- tests/test_tokenicer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_tokenicer.py b/tests/test_tokenicer.py index 1fcf468a4..5d30cb5e8 100644 --- a/tests/test_tokenicer.py +++ b/tests/test_tokenicer.py @@ -90,7 +90,7 @@ def test_tokenicer_save(self): with tempfile.TemporaryDirectory() as tmpdir: self.model.save(tmpdir) - verify_json_path = os.path.join(tmpdir, VALIDATE_JSON_FILE_NAME) + validate_json_path = os.path.join(tmpdir, VALIDATE_JSON_FILE_NAME) - result = os.path.isfile(verify_json_path) - self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.") + result = os.path.isfile(validate_json_path) + self.assertTrue(result, f"Save verify file failed: {validate_json_path} does not exist.") From 7431c760c3ba29a8d75504a9989de953dd0e0a25 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 09:28:38 +0000 Subject: [PATCH 5/7] update tokenicer test --- tests/test_tokenicer.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/test_tokenicer.py b/tests/test_tokenicer.py index 5d30cb5e8..e86ca3316 100644 --- a/tests/test_tokenicer.py +++ b/tests/test_tokenicer.py @@ -23,7 +23,9 @@ from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 from parameterized import parameterized # noqa: E402 from datasets import load_dataset -from tokenicer.const import VALIDATE_JSON_FILE_NAME +import json +from tokenicer.const import VALIDATE_JSON_FILE_NAME, VALIDATE_ENCODE_PARAMS +from tokenicer.config import ValidateConfig class TestTokenicer(unittest.TestCase): @@ -94,3 +96,19 @@ def test_tokenicer_save(self): result = os.path.isfile(validate_json_path) self.assertTrue(result, f"Save verify file failed: {validate_json_path} does not exist.") + + with open(validate_json_path, 'r', encoding='utf-8') as f: + data = json.loads(f.read()) + + config = ValidateConfig.from_dict(data) + + validate = True + for data in config.data: + input = data.input + tokenized = self.tokenizer.encode_plus(input, **VALIDATE_ENCODE_PARAMS)["input_ids"].tolist()[0] + if data.output != tokenized: + validate = False + break + + self.assertTrue(validate, f"Expected validate='True' but got '{validate}'.") + From 72a4e3ad7b61f68d33ebb2600bd661ad8d9db739 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 10:02:12 +0000 Subject: [PATCH 6/7] code opt --- gptqmodel/models/base.py | 2 -- gptqmodel/models/writer.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index b79eef416..1173b406c 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -977,8 +977,6 @@ def save( else: self.save_pretrained(save_dir, **kwargs) - self.tokenizer.save(save_dir) - def compile(self, backend="inductor", mode="max-autotune"): if not self.quantized: logger.warning("model is not quantized, skip compiling...") diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py index d42a5a819..5bc454659 100644 --- a/gptqmodel/models/writer.py +++ b/gptqmodel/models/writer.py @@ -315,6 +315,7 @@ def save_quantized( if self.tokenizer: self.tokenizer.save_pretrained(save_dir) + self.tokenizer.save(save_dir) # fixed this issue: https://github.com/huggingface/transformers/issues/35832 saved_tokenizer_config = get_tokenizer_config(save_dir) From 5b3681e1d951989a7674ebf6e8e37d620b32d7d9 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 10:08:05 +0000 Subject: [PATCH 7/7] remove useless code --- gptqmodel/models/writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py index 5bc454659..d42a5a819 100644 --- a/gptqmodel/models/writer.py +++ b/gptqmodel/models/writer.py @@ -315,7 +315,6 @@ def save_quantized( if self.tokenizer: self.tokenizer.save_pretrained(save_dir) - self.tokenizer.save(save_dir) # fixed this issue: https://github.com/huggingface/transformers/issues/35832 saved_tokenizer_config = get_tokenizer_config(save_dir)