diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6cea73f08b743..c8770df01f16b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -13,7 +13,7 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast import math import numpy as np @@ -265,7 +265,7 @@ def write_tensors(self): break for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): - data: np.ndarray # type hint + data: np.ndarray = data # type hint n_dims = len(data.shape) data_dtype = data.dtype data_qtype: gguf.GGMLQuantizationType | None = None @@ -380,7 +380,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) #self.hparams.get("vocab_size") assert max(tokenizer.vocab.values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) @@ -404,7 +404,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: return tokens, toktypes, tokpre - # NOTE: this function is generated by convert_hf_to_gguf_update.py + # NOTE: this function is generated by convert-hf-to-gguf-update.py # do not modify it manually! # ref: https://github.com/ggerganov/llama.cpp/pull/6920 # Marker: Start get_vocab_base_pre @@ -424,87 +424,18 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None - # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! 
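For context on the chkhsh comparisons below: get_vocab_base_pre() identifies a model's BPE pre-tokenizer by hashing the token IDs the tokenizer produces for a fixed reference string and matching that hash against known values. The following is a minimal sketch of that fingerprinting idea, assuming a Hugging Face tokenizer object; the reference string and helper name are illustrative, not the exact chktxt used by the script.

from hashlib import sha256

def tokenizer_fingerprint(tokenizer, reference_text: str) -> str:
    # Tokenizers with different pre-tokenization rules almost always produce
    # different ID sequences for the same text, and therefore different hashes.
    ids = tokenizer.encode(reference_text)
    return sha256(str(ids).encode()).hexdigest()

# Usage (assumes the `transformers` package and access to the model repo):
# from transformers import AutoTokenizer
# tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
# print(tokenizer_fingerprint(tok, "Hello, y'all! How are you 😁 ?"))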
- if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" - if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base - res = "deepseek-llm" - if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base - res = "deepseek-coder" - if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - # ref: https://huggingface.co/tiiuae/falcon-7b - res = "falcon" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 - res = "bert-bge" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/mosaicml/mpt-7b - res = "mpt" - if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - # ref: https://huggingface.co/bigcode/starcoder2-3b - res = "starcoder" - if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - # ref: https://huggingface.co/openai-community/gpt2 - res = "gpt-2" - if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b - res = "stablelm2" - if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - # ref: https://huggingface.co/smallcloudai/Refact-1_6-base - res = "refact" - if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 - res = "command-r" - if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - # ref: https://huggingface.co/Qwen/Qwen1.5-7B - res = "qwen2" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf - res = "olmo" - if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - # ref: https://huggingface.co/databricks/dbrx-base - res = "dbrx" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en - res = "jina-v2-en" - if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es - res = "jina-v2-es" - if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de - res = "jina-v2-de" - if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct - res = "smaug-bpe" - if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": - # ref: https://huggingface.co/LumiOpen/Poro-34B-chat - res = "poro-chat" - if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code - res = "jina-v2-code" - if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": - # ref: https://huggingface.co/THUDM/glm-4-9b-chat - res = "chatglm-bpe" - if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": - # ref: https://huggingface.co/LumiOpen/Viking-7B - res = "viking" - if chkhsh == 
"b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": - # ref: https://huggingface.co/core42/jais-13b - res = "jais" if res is None: logger.warning("\n") logger.warning("**************************************************************************************") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") @@ -582,23 +513,15 @@ def _set_vocab_qwen(self): special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_sentencepiece(self, add_to_gguf=True): - tokens, scores, toktypes = self._create_vocab_sentencepiece() - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _create_vocab_sentencepiece(self): + def _set_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") @@ -652,7 +575,14 @@ def _create_vocab_sentencepiece(self): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) - return tokens, scores, toktypes + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_llama_hf(self): vocab = gguf.LlamaHfVocab(self.dir_model) @@ -676,51 +606,6 @@ def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): - tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" - logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") - vocab_reader = gguf.GGUFReader(tokenizer_path, "r") - - default_pre = "mpt" if model_name == "gpt-neox" else "default" - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) - assert field # tokenizer model - self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) - - field = 
vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field # token list - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - - if model_name == "llama-spm": - field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) - assert field # token scores - self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field # token types - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - if model_name != "llama-spm": - field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field # token merges - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) - - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: - self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None: - self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None: - self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None: - self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None: - self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: - self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) - @Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): @@ -1986,7 +1871,7 @@ def set_gguf_parameters(self): if len(rope_scaling_type) == 0: raise KeyError('Missing the required key rope_scaling.type') - if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': + if rope_scaling_type == 'su': attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 elif rope_scaling_type == 'yarn': attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 @@ -2116,7 +2001,7 @@ def set_vocab(self): logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix @@ -2360,8 +2245,6 @@ def set_vocab(self): special_vocab._set_special_token("eot", 107) special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_space_prefix(False) - def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] @@ -2394,71 +2277,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma2ForCausalLM") -class Gemma2Model(Model): - model_arch = gguf.MODEL_ARCH.GEMMA2 - - def set_vocab(self): - tokens, scores, toktypes = self._create_vocab_sentencepiece() - # hack: This is required so that we can properly use start/end-of-turn for chat template - for i in range(108): - # including , , - toktypes[i] = SentencePieceTokenTypes.CONTROL - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - 
self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_attn_logit_softcapping( - self.hparams["attn_logit_softcapping"] - ) - self.gguf_writer.add_final_logit_softcapping( - self.hparams["final_logit_softcapping"] - ) - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - - # sanity check - attn_scalar = self.hparams["query_pre_attn_scalar"] - if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]: - raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. 
- if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] - - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - return [(self.map_tensor_name(name), data_torch)] - - @Model.register("Starcoder2ForCausalLM") class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 @@ -2483,7 +2301,39 @@ def set_vocab(self): self._set_vocab_sentencepiece() else: # Use the GPT-NeoX tokenizer when no tokenizer files are present - self._set_vocab_builtin("gpt-neox", vocab_size) + tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + neox_reader = gguf.GGUFReader(tokenizer_path, "r") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) + assert field + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + assert field + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) + assert field + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID) + self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0) def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "d_model"]) @@ -2635,82 +2485,6 @@ def set_vocab(self, *args, **kwargs): self.gguf_writer.add_add_eos_token(True) -@Model.register("OpenELMForCausalLM") -class OpenELMModel(Model): - model_arch = gguf.MODEL_ARCH.OPENELM - - @staticmethod - def _make_divisible(v: float | int, divisor: int) -> int: - # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 - new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. 
- if new_v < 0.9 * v: - new_v += divisor - return new_v - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] - ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] - self._n_embd: int = self.hparams["model_dim"] - self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] - self._num_query_heads: list[int] = self.hparams["num_query_heads"] - self._ffn_dims: list[int] = [ - OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) - for multiplier in ffn_multipliers - ] - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) - - # Uses the tokenizer from meta-llama/Llama-2-7b-hf - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) - - def set_gguf_parameters(self): - n_embd = self._n_embd - head_dim = self.hparams["head_dim"] - rot_pct = 1.0 - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_query_heads) - assert self.block_count == len(self._ffn_dims) - - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["max_context_length"]) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self._ffn_dims) - self.gguf_writer.add_head_count(self._num_query_heads) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) - # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 - self.gguf_writer.add_layer_norm_rms_eps(1e-6) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - self.gguf_writer.add_file_type(self.ftype) - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - if "n_layers" in keys: - return self.hparams["num_transformer_layers"] - - return super().find_hparam(keys, optional) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - - # split ff - if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": - ff_dim = self._ffn_dims[bid] - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) - return - - yield (self.map_tensor_name(name), data_torch) - - @Model.register("ArcticForCausalLM") class ArcticModel(Model): model_arch = gguf.MODEL_ARCH.ARCTIC @@ -2941,17 +2715,11 @@ def write_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("T5WithLMHeadModel") @Model.register("T5ForConditionalGeneration") -@Model.register("MT5ForConditionalGeneration") -@Model.register("UMT5ForConditionalGeneration") +@Model.register("T5WithLMHeadModel") class T5Model(Model): model_arch = gguf.MODEL_ARCH.T5 - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.shared_token_embeddings_found = False - def set_vocab(self): # to avoid TypeError: Descriptors cannot be created directly # exception when importing sentencepiece_model_pb2 @@ -2959,29 +2727,17 @@ def 
set_vocab(self): from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path = self.dir_model / 'tokenizer.model' - - # many older models use spiece.model tokenizer model filename - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' + tokenizer_path = self.dir_model / 'spiece.model' if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' - return self._set_vocab_sentencepiece() - else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) @@ -3051,10 +2807,7 @@ def set_vocab(self): def set_gguf_parameters(self): self.gguf_writer.add_name("T5") - if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") - n_ctx = 512 - self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) self.gguf_writer.add_block_count(self.hparams["num_layers"]) @@ -3070,295 +2823,16 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. - if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: - if not self.shared_token_embeddings_found: - name = "shared.weight" - self.shared_token_embeddings_found = True - else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return [] - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("JAISLMHeadModel") -class JaisModel(Model): - model_arch = gguf.MODEL_ARCH.JAIS - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # ALiBi position embedding - assert self.hparams["position_embedding_type"] == "alibi" - - # Embeddings scale - self.embeddings_scale = 1.0 - # note: For some JAIS flavors, output is tied to (same as) wte in original model - self.output_is_wte = False - if 'mup_embeddings_scale' in self.hparams: - self.output_is_wte = True # Hack (?) 
- self.embeddings_scale = self.hparams['mup_embeddings_scale'] - elif 'embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['embeddings_scale'] - else: - assert False - - self.width_scale = 1.0 - if 'mup_output_alpha' in self.hparams: - assert 'mup_width_scale' in self.hparams - self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] - elif 'width_scale' in self.hparams: - self.width_scale = self.hparams['width_scale'] - else: - assert False - - self.max_alibi_bias = 8.0 - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - - # we don't need these - if name.endswith((".attn.bias")): - return tensors - - if name.endswith(("relative_pe.slopes")): - # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) - # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, - # but Jais's PyTorch model simply precalculates the slope values and places them - # in relative_pes.slopes - n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) - first_val = float(data_torch[0].item()) - self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) - - return tensors - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = self.map_tensor_name(name) - - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((new_name, data_torch * self.embeddings_scale)) - if self.output_is_wte: - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) - elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - assert not self.output_is_wte - tensors.append((new_name, data_torch * self.width_scale)) - else: - tensors.append((new_name, data_torch)) - - return tensors - - def write_tensors(self): - super().write_tensors() - self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) - - -@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(Model): - model_arch = gguf.MODEL_ARCH.CHATGLM - - def set_vocab_chatglm3(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytes] = [] - toktypes: list[int] = [] - scores: list[float] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) - assert max(tokenizer.get_vocab().values()) < vocab_size - role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] - special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens - for token_id in range(vocab_size): - piece = tokenizer._convert_id_to_token(token_id) - if token_id == 0: - piece = "" - elif token_id == 1: - piece = 
"" - elif token_id == 2: - piece = "" - - text = piece.encode("utf-8") - score = 0.0 - # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), - # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() - if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): - score = tokenizer.tokenizer.sp_model.get_score(token_id) - - if len(piece) == 0: - text = f"[PAD{token_id}]".encode("utf-8") - - if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): - if piece in special_tokens: - # show special tokens in prompt - toktype = SentencePieceTokenTypes.USER_DEFINED - else: - toktype = SentencePieceTokenTypes.UNKNOWN - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - continue - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.tokenizer.sp_model.is_unknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.tokenizer.sp_model.is_control(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.tokenizer.sp_model.is_unused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.tokenizer.sp_model.is_byte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - # glm3 needs prefix and suffix formatted as: - # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" - self.gguf_writer.add_tokenizer_pre("chatglm-spm") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""): - self.set_vocab_chatglm3() - return - - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["padded_vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[ChatGLMModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) >= 2 and len(merged) <= 7 - merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, 
merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.get_added_vocab() - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_head_kv = self.hparams.get("multi_query_group_num", n_head) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed)) - self.gguf_writer.add_block_count(self.hparams["num_layers"]) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_dimension_count(64) - self.gguf_writer.add_add_bos_token(False) - rope_freq = 10000 - if "rope_ratio" in self.hparams: - rope_freq = rope_freq * self.hparams["rope_ratio"] - self.gguf_writer.add_rope_freq_base(rope_freq) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if name.endswith(".rotary_pos_emb.inv_freq"): + # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or + # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor + # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight". 
+ if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight": + logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.") return [] - name = name.removeprefix("transformer.") return [(self.map_tensor_name(name), data_torch)] + ###### CONVERSION LOGIC ###### @@ -3408,6 +2882,10 @@ def parse_args() -> argparse.Namespace: "--vocab-only", action="store_true", help="extract only the vocab", ) + parser.add_argument( + "--awq-path", type=Path, default=None, + help="Path to scale awq cache file", + ) parser.add_argument( "--outfile", type=Path, help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", @@ -3485,6 +2963,19 @@ def main() -> None: dir_model = args.model + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] + tmp_model_path = args.model / "weighted_model" + dir_model = tmp_model_path + if tmp_model_path.is_dir(): + logger.info(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + logger.info("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + logger.info(f"Saved weighted model at {tmp_model_path}.") + if not dir_model.is_dir(): logger.error(f'Error: {args.model} is not a directory') sys.exit(1) @@ -3497,8 +2988,7 @@ def main() -> None: "auto": gguf.LlamaFileType.GUESSED, } - is_split = args.split_max_tensors > 0 or args.split_max_size != "0" - if args.use_temp_file and is_split: + if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"): logger.error("Error: Cannot use temp file when splitting") sys.exit(1) @@ -3535,12 +3025,11 @@ def main() -> None: if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() - logger.info(f"Model vocab successfully exported to {model_instance.fname_out}") + logger.info("Model vocab successfully exported.") else: logger.info("Exporting model...") model_instance.write() - out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out - logger.info(f"Model successfully exported to {out_path}") + logger.info("Model successfully exported.") if __name__ == '__main__': diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index e4165ae2d977c..fe028275674d8 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # This script downloads the tokenizer models of the specified models from Huggingface and -# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py +# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py # # This is necessary in order to analyze the type of pre-tokenizer used by the model and # provide the necessary information to llama.cpp via the GGUF header in order to implement @@ -15,9 +15,9 @@ # - Add a new model to the "models" list # - Run the script with your huggingface token: # -# python3 convert_hf_to_gguf_update.py +# python3 convert-hf-to-gguf-update.py # -# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py +# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp @@ -27,6 +27,7 @@ import os import pathlib import re +import time import requests import sys @@ 
-37,15 +38,17 @@ from transformers import AutoTokenizer logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger("convert_hf_to_gguf_update") +logger = logging.getLogger("convert-hf-to-gguf-update") sess = requests.Session() +# User input for new model +new_name = input("Enter the name of the new model: ") +new_url = input("Enter the URL of the new model: ") class TOKENIZER_TYPE(IntEnum): SPM = auto() BPE = auto() WPM = auto() - UGM = auto() # TODO: this string has to exercise as much pre-tokenizer functionality as possible @@ -56,42 +59,21 @@ class TOKENIZER_TYPE(IntEnum): token = sys.argv[1] if not token.startswith("hf_"): logger.info("Huggingface token seems invalid") - logger.info("Usage: python convert_hf_to_gguf_update.py ") + logger.info("Usage: python convert-hf-to-gguf-update.py ") sys.exit(1) else: - logger.info("Usage: python convert_hf_to_gguf_update.py ") + logger.info("Usage: python convert-hf-to-gguf-update.py ") sys.exit(1) # TODO: add models here, base models preferred -models = [ - {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, - {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, - {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, - {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, - {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, - {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, - {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, - {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, - {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, - {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, - {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, - {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, - {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, - {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, - {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, - {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, - {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! 
- {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, - {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, - {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, - {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, - {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, - {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B - {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, - {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, - {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, - {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, -] +models = [] + +# Construct new entry and add to models list +new_entry = {"name": new_name, "tokt": TOKENIZER_TYPE.BPE, "repo": new_url} +models.append(new_entry) +print('Model added...') +print(models) +time.sleep(15) def download_file_with_auth(url, token, save_path): @@ -112,13 +94,9 @@ def download_model(model): os.makedirs(f"models/tokenizers/{name}", exist_ok=True) files = ["config.json", "tokenizer.json", "tokenizer_config.json"] - if tokt == TOKENIZER_TYPE.SPM: files.append("tokenizer.model") - if tokt == TOKENIZER_TYPE.UGM: - files.append("spiece.model") - for file in files: save_path = f"models/tokenizers/{name}/{file}" if os.path.isfile(save_path): @@ -134,14 +112,14 @@ def download_model(model): logger.error(f"Failed to download model {model['name']}. Error: {e}") -# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function: +# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: src_ifs = "" for model in models: name = model["name"] tokt = model["tokt"] - if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM: + if tokt == TOKENIZER_TYPE.SPM: continue # Skip if the tokenizer folder does not exist or there are other download issues previously @@ -151,10 +129,7 @@ def download_model(model): # create the tokenizer try: - if name == "t5": - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False) - else: - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") except OSError as e: logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") continue # Skip to the next model if the tokenizer can't be loaded @@ -201,7 +176,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None - # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! 
{src_ifs} @@ -210,9 +185,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("**************************************************************************************") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {{chkhsh}}") @@ -226,7 +201,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: return res """ -convert_py_pth = pathlib.Path("convert_hf_to_gguf.py") +convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") convert_py = convert_py_pth.read_text(encoding="utf-8") convert_py = re.sub( r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", @@ -237,7 +212,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: convert_py_pth.write_text(convert_py, encoding="utf-8") -logger.info("+++ convert_hf_to_gguf.py was updated") +logger.info("+++ convert-hf-to-gguf.py was updated") # generate tests for each tokenizer model @@ -275,7 +250,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: "\n =", "' era", "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", - "!!!!!!", "3", "33", "333", @@ -285,8 +259,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: "3333333", "33333333", "333333333", - "Cửa Việt", # llama-bpe fails on this - " discards", + # "Cửa Việt", # llama-bpe fails on this chktxt, ] @@ -314,10 +287,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: # create the tokenizer try: - if name == "t5": - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False) - else: - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}",) except OSError as e: logger.error(f"Failed to load tokenizer for model {name}. Error: {e}") continue # Skip this model and continue with the next one in the loop @@ -343,6 +313,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: for model in models: name = model["name"] - print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 + print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 logger.info("\n")
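The update script's download_model() relies on download_file_with_auth(url, token, save_path), whose signature appears above but whose body is outside this diff. As a rough sketch of what such a helper typically does (an assumption, not the script's actual implementation): fetch each file from the Hugging Face repo with a bearer token and write it to disk.

import os
import requests

def download_file_with_auth(url: str, token: str, save_path: str) -> None:
    # Send the Hugging Face token as a bearer credential so gated repos can be fetched.
    response = requests.get(url, headers={"Authorization": f"Bearer {token}"})
    response.raise_for_status()  # surface 401/404 instead of silently saving an error page
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
    with open(save_path, "wb") as f:
        f.write(response.content)

# Example (illustrative URL layout): fetch a tokenizer config the way download_model() does.
# download_file_with_auth(
#     "https://huggingface.co/openai-community/gpt2/resolve/main/tokenizer_config.json",
#     token="hf_...",
#     save_path="models/tokenizers/gpt-2/tokenizer_config.json",
# )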