From d383c0d818bc365c72fef79d89ae3d0a19ed931d Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Fri, 12 Apr 2024 02:17:50 -0700 Subject: [PATCH 01/25] StableLM2 12B support for huggingface -> GGUF --- convert-hf-to-gguf.py | 811 ++++++++++++++++++++++++++++++++---------- 1 file changed, 629 insertions(+), 182 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e1ac09e024b11..b037c50cb2000 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -11,7 +11,16 @@ from abc import ABC, abstractmethod from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ContextManager, + Iterator, + Sequence, + TypeVar, + cast, +) import numpy as np import torch @@ -19,15 +28,15 @@ if TYPE_CHECKING: from torch import Tensor -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +if "NO_LOCAL_GGUF" not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) import gguf from convert import LlamaHfVocab, permute - ###### MODEL DEFINITIONS ###### + class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 @@ -43,18 +52,31 @@ class SentencePieceTokenTypes(IntEnum): class Model(ABC): _model_classes: dict[str, type[Model]] = {} - def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): + def __init__( + self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool + ): self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.endianess = ( + gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + ) self.is_safetensors = self._is_model_safetensors() - self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") + self.num_parts = Model.count_model_parts( + self.dir_model, ".safetensors" if self.is_safetensors else ".bin" + ) self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + self.gguf_writer = gguf.GGUFWriter( + fname_out, + gguf.MODEL_ARCH_NAMES[self.model_arch], + endianess=self.endianess, + use_temp_file=False, + ) + self.block_count = self.find_hparam( + ["n_layers", "num_hidden_layers", "n_layer"] + ) @property @abstractmethod @@ -78,20 +100,39 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: ctx: ContextManager[Any] if self.is_safetensors: from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) + + ctx = cast( + ContextManager[Any], + safe_open(self.dir_model / part_name, framework="pt", device="cpu"), + ) else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + ctx = contextlib.nullcontext( + torch.load( + str(self.dir_model / part_name), + map_location="cpu", + mmap=True, + weights_only=True, + ) + ) with ctx as model_part: for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + data = ( + 
model_part.get_tensor(name) + if self.is_safetensors + else model_part[name] + ) yield name, data def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.block_count) - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + if ( + n_ctx := self.find_hparam( + ["max_position_embeddings", "n_ctx"], optional=True + ) + ) is not None: self.gguf_writer.add_context_length(n_ctx) print(f"gguf: context length = {n_ctx}") @@ -99,7 +140,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(n_embd) print(f"gguf: embedding length = {n_embd}") - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + if ( + n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True) + ) is not None: self.gguf_writer.add_feed_forward_length(n_ff) print(f"gguf: feed forward length = {n_ff}") @@ -117,7 +160,11 @@ def set_gguf_parameters(self): if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) print(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + if ( + f_norm_eps := self.find_hparam( + ["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True + ) + ) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) print(f"gguf: layer norm epsilon = {f_norm_eps}") if (n_experts := self.hparams.get("num_local_experts")) is not None: @@ -131,11 +178,20 @@ def set_gguf_parameters(self): print(f"gguf: file type = {self.ftype}") def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + block_count = self.hparams.get( + "n_layers", + self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + if name.endswith( + ( + ".attention.masked_bias", + ".attention.bias", + ".attention.rotary_emb.inv_freq", + ) + ): continue old_dtype = data_torch.dtype @@ -160,11 +216,21 @@ def write_tensors(self): data = data.astype(np.float32) # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): + if ( + self.ftype == 1 + and data_dtype == np.float16 + and (n_dims == 1 or new_name.endswith("_norm.weight")) + ): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and not new_name.endswith("_norm.weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -205,6 +271,7 @@ def func(modelcls: type[Model]): for name in names: cls._model_classes[name] = modelcls return modelcls + return func @classmethod @@ -212,7 +279,7 @@ def from_model_architecture(cls, arch): try: return cls._model_classes[arch] except KeyError: - raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + raise NotImplementedError(f"Architecture {arch!r} not supported!") from None def _is_model_safetensors(self) -> bool: return Model.count_model_parts(self.dir_model, ".safetensors") > 0 @@ -221,11 +288,17 @@ def _get_part_names(self): if self.is_safetensors: if self.num_parts == 1: # there's only one .safetensors file return ("model.safetensors",) - return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) + return ( + f"model-{n:05}-of-{self.num_parts:05}.safetensors" + for n in range(1, self.num_parts + 1) + ) if self.num_parts == 1: # there's only one .bin file return ("pytorch_model.bin",) - return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) + return ( + f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" + for n in range(1, self.num_parts + 1) + ) # used for GPT-2 BPE and WordPiece vocabs def get_basic_vocab(self) -> tuple[list[str], list[int]]: @@ -233,11 +306,14 @@ def get_basic_vocab(self) -> tuple[list[str], list[int]]: toktypes: list[int] = [] from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + reverse_vocab = { + id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items() + } added_vocab = tokenizer.get_added_vocab() for i in range(vocab_size): @@ -272,6 +348,7 @@ def _set_vocab_qwen(self): toktypes: list[int] = [] from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams["vocab_size"] assert max(tokenizer.get_vocab().values()) < vocab_size @@ -285,11 +362,13 @@ def _set_vocab_qwen(self): continue merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + merges.append(" ".join(map(QwenModel.token_bytes_to_string, merged))) # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} + reverse_vocab = { + id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items() + } for i in range(vocab_size): if i not in reverse_vocab: @@ -310,16 
+389,22 @@ def _set_vocab_qwen(self): special_vocab.merges = merges # only add special tokens when they were not already loaded from config.json if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token( + "bos", tokenizer.special_tokens["<|endoftext|>"] + ) + special_vocab._set_special_token( + "eos", tokenizer.special_tokens["<|endoftext|>"] + ) # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token( + "unk", tokenizer.special_tokens["<|endoftext|>"] + ) special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / 'tokenizer.model' + tokenizer_path = self.dir_model / "tokenizer.model" tokens: list[bytes] = [] scores: list[float] = [] @@ -329,7 +414,7 @@ def _set_vocab_sentencepiece(self): raise FileNotFoundError(f"File not found: {tokenizer_path}") tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) for token_id in range(tokenizer.vocab_size()): piece = tokenizer.id_to_piece(token_id) @@ -350,7 +435,7 @@ def _set_vocab_sentencepiece(self): scores.append(score) toktypes.append(toktype) - added_tokens_file = self.dir_model / 'added_tokens.json' + added_tokens_file = self.dir_model / "added_tokens.json" if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) @@ -407,10 +492,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + int( + self.hparams["rotary_pct"] + * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + ), ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_parallel_residual( + self.hparams.get("use_parallel_residual", True) + ) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) @@ -440,10 +530,13 @@ def write_tensors(self): n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) for name, data_torch in tensors.items(): - if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): + if ( + "lm_head.weight" not in tensors.keys() + and "output.weight" not in tensors.keys() + ): has_lm_head = False - name = re.sub(r'transformer\.', '', name) + name = re.sub(r"transformer\.", "", name) old_dtype = data_torch.dtype @@ -497,7 +590,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") @@ -506,7 +604,10 @@ def write_tensors(self): if not 
has_lm_head and name == "word_embeddings.weight": self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + print( + name, + f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}", + ) @Model.register("MPTForCausalLM") @@ -538,16 +639,26 @@ def set_gguf_parameters(self): if self.hparams["attn_config"]["clip_qkv"] is not None: self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + self.gguf_writer.add_max_alibi_bias( + self.hparams["attn_config"]["alibi_bias_max"] + ) else: self.gguf_writer.add_max_alibi_bias(0.0) def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) + block_count = self.hparams.get( + "n_layers", self.hparams.get("num_hidden_layers") + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + if name.endswith( + ( + ".attention.masked_bias", + ".attention.bias", + ".attention.rotary_emb.inv_freq", + ) + ): continue old_dtype = data_torch.dtype @@ -560,7 +671,9 @@ def write_tensors(self): # map tensor names if "scales" in name: - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = tensor_map.get_name( + name, try_suffixes=(".weight", ".bias", ".scales") + ) if new_name is not None: new_name = new_name.replace("scales", "act.scales") else: @@ -581,7 +694,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -664,10 +782,17 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + print( + f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}" + ) self.gguf_writer.add_tensor(new_name, data) @@ -702,15 +827,22 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in 
self.hparams["rope_scaling"] + ): if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) def write_tensors(self): # Collect tensors from generator object @@ -721,14 +853,19 @@ def write_tensors(self): head_count_kv = self.hparams.get("num_key_value_heads", head_count) for i in range(block_count): - if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: + if ( + w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight") + ) is not None: print(f"Unpacking and permuting layer {i}") - model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ - self._reverse_hf_permute_part(w, 0, head_count, head_count) - model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ - self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) - model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ - self._reverse_hf_part(w, 2) + model_kv[ + f"model.layers.{i}.self_attn.q_proj.weight" + ] = self._reverse_hf_permute_part(w, 0, head_count, head_count) + model_kv[ + f"model.layers.{i}.self_attn.k_proj.weight" + ] = self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) + model_kv[ + f"model.layers.{i}.self_attn.v_proj.weight" + ] = self._reverse_hf_part(w, 2) del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] for name, data_torch in model_kv.items(): @@ -762,31 +899,48 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + print( + f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}" + ) self.gguf_writer.add_tensor(new_name, data) - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + def _reverse_hf_permute( + self, weights: Tensor, n_head: int, n_kv_head: int | None = None + ) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) .swapaxes(1, 2) .reshape(weights.shape) ) def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + self, + weights: Tensor, + n_part: int, + n_head: int, + n_head_kv: int | None = None, ) -> Tensor: r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + return self._reverse_hf_permute( + weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv + ) def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] + return weights[r * n_part : r * n_part + r, ...] 
@Model.register("XverseForCausalLM") @@ -802,20 +956,23 @@ def set_vocab(self): toktypes: list[int] = [] from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + reverse_vocab = { + id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items() + } added_vocab = tokenizer.get_added_vocab() for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode('utf-8') + token_text = reverse_vocab[token_id].encode("utf-8") # replace "\x00" to string with length > 0 if token_text == b"\x00": toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode('utf-8') - elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + token_text = f"<{token_text}>".encode("utf-8") + elif re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): toktype = gguf.TokenType.BYTE # special elif reverse_vocab[token_id] in added_vocab: if tokenizer.added_tokens_decoder[token_id].special: @@ -859,15 +1016,22 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) def write_tensors(self): # Collect tensors from generator object @@ -890,9 +1054,13 @@ def write_tensors(self): # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) + data_torch = self._reverse_hf_permute( + data_torch, head_count, head_count + ) if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) + data_torch = self._reverse_hf_permute( + data_torch, head_count, head_count_kv + ) data = data_torch.squeeze().numpy() @@ -914,18 +1082,29 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + print( + f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}" + ) self.gguf_writer.add_tensor(new_name, data) - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int 
| None = None) -> Tensor: + def _reverse_hf_permute( + self, weights: Tensor, n_head: int, n_kv_head: int | None = None + ) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) .swapaxes(1, 2) .reshape(weights.shape) ) @@ -993,7 +1172,9 @@ def write_tensors(self): # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py if "query_key_value" in name: - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + qkv = data_torch.view( + n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head + ) q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) @@ -1019,7 +1200,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1086,13 +1272,19 @@ def write_tensors(self): tensors = dict(self.get_tensors()) for i in range(block_count): if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] - tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] + tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[ + : n_head_kv * head_dim + ] + tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[ + n_head_kv * head_dim : + ] del tensors[f"transformer.h.{i}.attn.kv.weight"] if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w del tensors[f"transformer.h.{i}.attn.q.weight"] - if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: + if ( + w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight") + ) is not None: tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] @@ -1124,7 +1316,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1137,12 +1334,14 @@ class PersimmonModel(Model): model_arch = gguf.MODEL_ARCH.PERSIMMON def set_gguf_parameters(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + block_count = self.hparams.get( + "num_layers", self.hparams.get("num_hidden_layers") + ) head_count = self.hparams["num_attention_heads"] head_count_kv = head_count hidden_size = self.hparams["hidden_size"] - self.gguf_writer.add_name('persimmon-8b-chat') + self.gguf_writer.add_name("persimmon-8b-chat") 
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -1165,7 +1364,9 @@ def set_vocab(self): # self.gguf_writer.add_eos_token_id(71013) def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + block_count = self.hparams.get( + "num_layers", self.hparams.get("num_hidden_layers") + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): @@ -1183,18 +1384,35 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(Model): +@Model.register( + "StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM" +) +class StableLM2Model(Model): model_arch = gguf.MODEL_ARCH.STABLELM + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + model_arch = ( + gguf.MODEL_ARCH.STABLELM + if self.hparams["num_hidden_layers"] < 40 + else gguf.MODEL_ARCH.STABLELM2 + ) + self.gguf_writer = gguf.GGUFWriter( + self.fname_out, + gguf.MODEL_ARCH_NAMES[model_arch], + endianess=self.endianess, + use_temp_file=False, + ) + def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() else: - # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab + # StableLM 2 uses a vocab in a similar format to Qwen's vocab self._set_vocab_qwen() def set_gguf_parameters(self): + super().set_gguf_parameters() hparams = self.hparams block_count = hparams["num_hidden_layers"] @@ -1204,10 +1422,22 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_rope_dimension_count( + int( + rotary_factor + * (hparams["hidden_size"] // hparams["num_attention_heads"]) + ) + ) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_parallel_residual( + hparams["use_parallel_residual"] + if "use_parallel_residual" in hparams + else True + ) + self.gguf_writer.add_layer_norm_eps( + self.find_hparam(["layer_norm_eps", "norm_eps"]) + ) @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") @@ -1216,7 +1446,7 @@ class LlamaModel(Model): def set_vocab(self): try: - self. 
_set_vocab_sentencepiece() + self._set_vocab_sentencepiece() except FileNotFoundError: self._set_vocab_llama_hf() @@ -1224,11 +1454,16 @@ def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + hparams["hidden_size"] // hparams["num_attention_heads"] + ) # Same as super class, but permuting q_proj, k_proj def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + block_count = self.hparams.get( + "n_layers", + self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) n_head = self.hparams.get("num_attention_heads") n_kv_head = self.hparams.get("num_key_value_heads") @@ -1236,7 +1471,13 @@ def write_tensors(self): experts = dict() for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + if name.endswith( + ( + ".attention.masked_bias", + ".attention.bias", + ".attention.rotary_emb.inv_freq", + ) + ): continue old_dtype = data_torch.dtype @@ -1285,14 +1526,20 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32: data = data.astype(np.float16) - merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight" + merged_name = ( + f"layers.{bid}.feed_forward.experts.w{wid}.weight" + ) - new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) + new_name = tensor_map.get_name( + merged_name, try_suffixes=(".weight", ".bias") + ) if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + print( + f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}" + ) self.gguf_writer.add_tensor(new_name, data) continue @@ -1315,7 +1562,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1341,13 +1593,22 @@ def set_gguf_parameters(self): self.gguf_writer.add_name("Grok") def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + block_count = self.hparams.get( + "n_layers", + self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) n_experts = self.hparams.get("num_local_experts") experts = dict() for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + if name.endswith( + ( + ".attention.masked_bias", + ".attention.bias", + ".attention.rotary_emb.inv_freq", + ) + ): continue old_dtype = data_torch.dtype @@ -1389,14 +1650,20 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32: data = data.astype(np.float16) - merged_name = 
f"transformer.decoder_layer.{bid}.moe.{wid}.weight" + merged_name = ( + f"transformer.decoder_layer.{bid}.moe.{wid}.weight" + ) - new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) + new_name = tensor_map.get_name( + merged_name, try_suffixes=(".weight", ".bias") + ) if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + print( + f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}" + ) self.gguf_writer.add_tensor(new_name, data) continue @@ -1419,7 +1686,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1534,7 +1806,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) @@ -1543,24 +1817,37 @@ def set_gguf_parameters(self): def set_vocab(self): self._set_vocab_llama_hf() - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + def _reverse_hf_permute( + self, weights: Tensor, n_head: int, n_kv_head: int | None = None + ) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) .swapaxes(1, 2) .reshape(weights.shape) ) def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + block_count = self.hparams.get( + "n_layers", + self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) n_head = self.hparams.get("num_attention_heads") n_kv_head = self.hparams.get("num_key_value_heads") for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + if name.endswith( + ( + ".attention.masked_bias", + ".attention.bias", + ".attention.rotary_emb.inv_freq", + ) + ): continue old_dtype = data_torch.dtype @@ -1595,7 +1882,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> 
{data.dtype}") @@ -1610,11 +1902,14 @@ class QwenModel(Model): @staticmethod def token_bytes_to_string(b): from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode + byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + def bpe( + mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None + ) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -1627,7 +1922,11 @@ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = if min_rank is None or (max_rank is not None and min_rank >= max_rank): break assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + parts = ( + parts[:min_idx] + + [parts[min_idx] + parts[min_idx + 1]] + + parts[min_idx + 2 :] + ) return parts def set_vocab(self): @@ -1640,7 +1939,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) @@ -1679,14 +1980,19 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) -@Model.register("Qwen2ForCausalLM") +@Model.register("wen2ForCausalLM") class Qwen2Model(Model): model_arch = gguf.MODEL_ARCH.QWEN2 @@ -1706,15 +2012,28 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + block_count = self.hparams.get( + "n_layers", + self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): + if name.endswith( + ( + ".attention.masked_bias", + ".attention.bias", + ".attention.rotary_emb.inv_freq", + ".attn.bias", + ".attn.masked_bias", + ) + ): continue - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + if name.endswith( + (".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight") + ): data_torch = data_torch.transpose(1, 0) old_dtype = data_torch.dtype @@ -1743,7 +2062,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors 
to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1768,14 +2092,18 @@ def set_gguf_parameters(self): n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + self.gguf_writer.add_context_length( + self.find_hparam(["n_positions", "max_position_embeddings"]) + ) self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_layer_norm_eps( + self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]) + ) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_add_bos_token(False) @@ -1798,7 +2126,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_head_count_kv( + 5 + ) # hparams["num_key_value_heads"]) is wrong self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) def shuffle_attn_q_weight(self, data_torch): @@ -1816,7 +2146,9 @@ def shuffle_attn_output_weight(self, data_torch): return data_torch def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + block_count = self.hparams.get( + "num_layers", self.hparams.get("num_hidden_layers") + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): @@ -1855,7 +2187,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1884,10 +2221,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_factor(1.0) def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + block_count = self.hparams.get( + "n_layers", + self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) tensors = dict(self.get_tensors()) - has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() + has_lm_head = ( + "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() + ) for name, data_torch in tensors.items(): # we don't need these if name.endswith((".attn.rotary_emb.inv_freq")): @@ -1919,7 +2261,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if 
self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1928,7 +2275,10 @@ def write_tensors(self): if not has_lm_head and name == "transformer.wte.weight": self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + print( + name, + f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}", + ) @Model.register("InternLM2ForCausalLM") @@ -1943,14 +2293,14 @@ def set_vocab(self): from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path = self.dir_model / 'tokenizer.model' + tokenizer_path = self.dir_model / "tokenizer.model" tokens: list[bytes] = [] scores: list[float] = [] toktypes: list[int] = [] if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + print(f"Error: Missing {tokenizer_path}", file=sys.stderr) sys.exit(1) sentencepiece_model = model.ModelProto() @@ -1958,7 +2308,7 @@ def set_vocab(self): add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) for token_id in range(vocab_size): piece = tokenizer.id_to_piece(token_id) @@ -1984,7 +2334,7 @@ def set_vocab(self): scores.append(score) toktypes.append(toktype) - added_tokens_file = self.dir_model / 'added_tokens.json' + added_tokens_file = self.dir_model / "added_tokens.json" if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) @@ -2005,14 +2355,16 @@ def set_vocab(self): if "chat" in os.path.basename(self.dir_model.absolute()): # For the chat model, we replace the eos with '<|im_end|>'. special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ -in chat mode so that the conversation can end normally.") + print( + f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ +in chat mode so that the conversation can end normally." 
+ ) special_vocab.add_to_gguf(self.gguf_writer) def _try_get_sft_eos(self, tokenizer): - unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') - im_end_list = tokenizer.encode('<|im_end|>') + unused_145_list = tokenizer.encode("[UNUSED_TOKEN_145]") + im_end_list = tokenizer.encode("<|im_end|>") assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) if len(unused_145_list) == 1: eos_token = unused_145_list[0] @@ -2023,9 +2375,13 @@ def _try_get_sft_eos(self, tokenizer): def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) def set_gguf_parameters(self): self.gguf_writer.add_name("InternLM2") @@ -2065,7 +2421,12 @@ def post_write_tensors(self, tensor_map, name, data_torch): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2093,15 +2454,35 @@ def write_tensors(self): if re.match(qkv_pattern, name): bid = re.findall(qkv_pattern, name)[0] qkv = data_torch - qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + qkv = rearrange( + qkv.T, + " o (g n i) ->o g n i", + g=num_groups, + n=q_per_kv + 2, + i=head_dim, + ) + q, k, v = ( + qkv[..., :q_per_kv, :], + qkv[..., q_per_kv : q_per_kv + 1, :], + qkv[..., q_per_kv + 1 : q_per_kv + 2, :], + ) # The model weights of q and k equire additional reshape. 
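# After the rearrange above, qkv has shape (o, num_groups, q_per_kv + 2, head_dim):
# each key/value group holds q_per_kv query heads followed by one key head and one
# value head, so q, k and v are sliced from the next-to-last axis, flattened back
# with rearrange, and (for q and k only) run through _hf_permute_qk to match the
# rotary layout used on the llama.cpp side.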
- q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) + q = self._hf_permute_qk( + rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads + ) + k = self._hf_permute_qk( + rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads + ) v = rearrange(v, " o g n i -> o (g n i)").T - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) + self.post_write_tensors( + tensor_map, f"model.layers.{bid}.attention.wq.weight", q + ) + self.post_write_tensors( + tensor_map, f"model.layers.{bid}.attention.wk.weight", k + ) + self.post_write_tensors( + tensor_map, f"model.layers.{bid}.attention.wv.weight", v + ) else: self.post_write_tensors(tensor_map, name, data_torch) @@ -2131,7 +2512,9 @@ def set_gguf_parameters(self): # get pooling type if pooling_path is not None: - with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + with open( + self.dir_model / pooling_path / "config.json", encoding="utf-8" + ) as f: pooling = json.load(f) if pooling["pooling_mode_mean_tokens"]: pooling_type = gguf.PoolingType.MEAN @@ -2156,6 +2539,7 @@ def phantom(tok): if tok.startswith("##"): return tok[2:] return "\u2581" + tok + tokens = list(map(phantom, tokens)) # add vocab to gguf @@ -2172,7 +2556,11 @@ def write_tensors(self): tensors = dict(self.get_tensors()) for name, data_torch in tensors.items(): # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + if name in ( + "embeddings.position_ids", + "pooler.dense.weight", + "pooler.dense.bias", + ): continue # we don't need these # map tensor names @@ -2186,8 +2574,11 @@ def write_tensors(self): new_dtype: type[np.floating[Any]] if ( - self.ftype == 1 and name.endswith(".weight") and n_dims == 2 - and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32 + self.ftype == 1 + and name.endswith(".weight") + and n_dims == 2 + and name + != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32 ): # if f16 desired, convert any float32 2-dim weight tensors to float16 new_dtype = np.float16 @@ -2250,14 +2641,21 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv( + self.hparams["num_key_value_heads"] + if "num_key_value_heads" in hparams + else hparams["num_attention_heads"] + ) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + block_count = self.hparams.get( + "n_layers", + self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + ) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) 
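# each checkpoint tensor is mapped to its GGUF name, squeezed, converted to a
# supported dtype (float32 by default; 2-D .weight tensors to float16 when an
# f16 output file was requested) and then handed to the writer below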
for name, data_torch in self.get_tensors(): @@ -2284,7 +2682,12 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2315,17 +2718,25 @@ def set_vocab(self): else: # Use the GPT-NeoX tokenizer when no tokenizer files are present tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + print( + f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'" + ) neox_reader = gguf.GGUFReader(tokenizer_path, "r") field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + self.gguf_writer.add_token_list( + [bytes(field.parts[i]) for i in field.data][:vocab_size] + ) field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + self.gguf_writer.add_token_types( + [field.parts[i].tolist()[0] for i in field.data][:vocab_size] + ) field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + self.gguf_writer.add_token_merges( + [bytes(field.parts[i]) for i in field.data] + ) field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) @@ -2335,23 +2746,37 @@ def set_vocab(self): def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = ( + self.find_hparam(["intermediate_size", "d_inner"], optional=True) + or 2 * d_model + ) d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 # ceiling division # ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -( + d_model // -16 + ) + rms_norm_eps = ( + self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) + or 1e-5 + ) # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_context_length( + 2**20 + ) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) 
# unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_feed_forward_length( + 0 + ) # unused, but seemingly required when loading + self.gguf_writer.add_head_count( + 0 + ) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) @@ -2366,7 +2791,7 @@ def write_tensors(self): tok_embd = None tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight" - output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight" + output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight" for name, data_torch in self.get_tensors(): old_dtype = data_torch.dtype @@ -2407,8 +2832,17 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert big float32 2-dim weight tensors to float16 - new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else "" - if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: + new_weight_name = ( + new_name[: -len(".weight")] if new_name.endswith(".weight") else "" + ) + if ( + self.ftype == 1 + and data_dtype == np.float32 + and new_weight_name.endswith( + (".ssm_in", ".ssm_out", "token_embd", "output") + ) + and n_dims == 2 + ): data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2438,25 +2872,36 @@ def set_gguf_parameters(self): def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") + description="Convert a huggingface model to a GGML compatible file" + ) parser.add_argument( - "--vocab-only", action="store_true", + "--vocab-only", + action="store_true", help="extract only the vocab", ) parser.add_argument( - "--awq-path", type=Path, default=None, - help="Path to scale awq cache file") + "--awq-path", type=Path, default=None, help="Path to scale awq cache file" + ) parser.add_argument( - "--outfile", type=Path, + "--outfile", + type=Path, help="path to write to; default: based on input", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16"], default="f16", + "--outtype", + type=str, + choices=["f32", "f16"], + default="f16", help="output format - use f32 for float32, f16 for float16", ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") parser.add_argument( - "model", type=Path, + "--bigendian", + action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", + type=Path, help="directory containing model file", ) @@ -2469,8 +2914,9 @@ def main() -> None: dir_model = args.model if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + sys.path.insert(1, str(Path(__file__).parent / "awq-py")) from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] + tmp_model_path = args.model / "weighted_model" dir_model = tmp_model_path if tmp_model_path.is_dir(): @@ -2482,19 +2928,18 @@ def main() -> None: print(f"Saved weighted model at {tmp_model_path}.") if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file=sys.stderr) + print(f"Error: {args.model} is not a directory", file=sys.stderr) sys.exit(1) ftype_map = { "f32": gguf.GGMLQuantizationType.F32, "f16": 
gguf.GGMLQuantizationType.F16, } - if args.outfile is not None: fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' + fname_out = dir_model / f"ggml-model-{args.outtype}.gguf" print(f"Loading model: {dir_model.name}") @@ -2502,7 +2947,9 @@ def main() -> None: with torch.inference_mode(): model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) + model_instance = model_class( + dir_model, ftype_map[args.outtype], fname_out, args.bigendian + ) print("Set model parameters") model_instance.set_gguf_parameters() @@ -2520,5 +2967,5 @@ def main() -> None: print(f"Model successfully exported to '{fname_out}'") -if __name__ == '__main__': +if __name__ == "__main__": main() From 13387d9c57ec255ecccb22bf4d06999f3bb78987 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Fri, 12 Apr 2024 02:18:51 -0700 Subject: [PATCH 02/25] StableLM12 tensormapping and constants --- gguf-py/gguf/constants.py | 603 +++++++++++++++++---------------- gguf-py/gguf/tensor_mapping.py | 336 +++++++++--------- 2 files changed, 482 insertions(+), 457 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2566b2fb8f296..b5248b7497dbd 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -8,8 +8,8 @@ # constants # -GGUF_MAGIC = 0x46554747 # "GGUF" -GGUF_VERSION = 3 +GGUF_MAGIC = 0x46554747 # "GGUF" +GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 # @@ -19,77 +19,79 @@ class Keys: class General: - ARCHITECTURE = "general.architecture" + ARCHITECTURE = "general.architecture" QUANTIZATION_VERSION = "general.quantization_version" - ALIGNMENT = "general.alignment" - NAME = "general.name" - AUTHOR = "general.author" - VERSION = "general.version" - URL = "general.url" - DESCRIPTION = "general.description" - LICENSE = "general.license" - SOURCE_URL = "general.source.url" - SOURCE_HF_REPO = "general.source.huggingface.repository" - FILE_TYPE = "general.file_type" + ALIGNMENT = "general.alignment" + NAME = "general.name" + AUTHOR = "general.author" + VERSION = "general.version" + URL = "general.url" + DESCRIPTION = "general.description" + LICENSE = "general.license" + SOURCE_URL = "general.source.url" + SOURCE_HF_REPO = "general.source.huggingface.repository" + FILE_TYPE = "general.file_type" class LLM: - VOCAB_SIZE = "{arch}.vocab_size" - CONTEXT_LENGTH = "{arch}.context_length" - EMBEDDING_LENGTH = "{arch}.embedding_length" - BLOCK_COUNT = "{arch}.block_count" - FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" + VOCAB_SIZE = "{arch}.vocab_size" + CONTEXT_LENGTH = "{arch}.context_length" + EMBEDDING_LENGTH = "{arch}.embedding_length" + BLOCK_COUNT = "{arch}.block_count" + FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" - TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - EXPERT_COUNT = "{arch}.expert_count" - EXPERT_USED_COUNT = "{arch}.expert_used_count" - POOLING_TYPE = "{arch}.pooling_type" - LOGIT_SCALE = "{arch}.logit_scale" + TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = "{arch}.expert_used_count" + POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = 
"{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" + HEAD_COUNT = "{arch}.attention.head_count" + HEAD_COUNT_KV = "{arch}.attention.head_count_kv" + MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" + CLAMP_KQV = "{arch}.attention.clamp_kqv" + KEY_LENGTH = "{arch}.attention.key_length" + VALUE_LENGTH = "{arch}.attention.value_length" + LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" - CAUSAL = "{arch}.attention.causal" + CAUSAL = "{arch}.attention.causal" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - FREQ_BASE = "{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + FREQ_BASE = "{arch}.rope.freq_base" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" class SSM: - CONV_KERNEL = "{arch}.ssm.conv_kernel" - INNER_SIZE = "{arch}.ssm.inner_size" - STATE_SIZE = "{arch}.ssm.state_size" + CONV_KERNEL = "{arch}.ssm.conv_kernel" + INNER_SIZE = "{arch}.ssm.inner_size" + STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" class Tokenizer: - MODEL = "tokenizer.ggml.model" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - CLS_ID = "tokenizer.ggml.cls_token_id" - MASK_ID = "tokenizer.ggml.mask_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" + MODEL = "tokenizer.ggml.model" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + TOKEN_TYPE_COUNT = ( + "tokenizer.ggml.token_type_count" # for BERT-style token types + ) + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + CLS_ID = "tokenizer.ggml.cls_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" # @@ -98,30 +100,31 @@ class Tokenizer: class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - PERSIMMON = auto() - REFACT = auto() - BERT = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = 
auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + PERSIMMON = auto() + REFACT = auto() + BERT = auto() NOMIC_BERT = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - PHI2 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - GEMMA = auto() + BLOOM = auto() + STABLELM = auto() + STABLELM2 = auto() + QWEN = auto() + QWEN2 = auto() + PHI2 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + GEMMA = auto() STARCODER2 = auto() MAMBA = auto() XVERSE = auto() @@ -130,111 +133,111 @@ class MODEL_ARCH(IntEnum): class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() + TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() - TOKEN_TYPES = auto() - POS_EMBD = auto() - OUTPUT = auto() - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_OUT_NORM = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE_INP = auto() - FFN_NORM = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_ACT = auto() - FFN_GATE_EXP = auto() - FFN_DOWN_EXP = auto() - FFN_UP_EXP = auto() - ATTN_Q_NORM = auto() - ATTN_K_NORM = auto() - LAYER_OUT_NORM = auto() - SSM_IN = auto() - SSM_CONV1D = auto() - SSM_X = auto() - SSM_DT = auto() - SSM_A = auto() - SSM_D = auto() - SSM_OUT = auto() + TOKEN_TYPES = auto() + POS_EMBD = auto() + OUTPUT = auto() + OUTPUT_NORM = auto() + ROPE_FREQS = auto() + ATTN_Q = auto() + ATTN_K = auto() + ATTN_V = auto() + ATTN_QKV = auto() + ATTN_OUT = auto() + ATTN_NORM = auto() + ATTN_NORM_2 = auto() + ATTN_OUT_NORM = auto() + ATTN_ROT_EMBD = auto() + FFN_GATE_INP = auto() + FFN_NORM = auto() + FFN_GATE = auto() + FFN_DOWN = auto() + FFN_UP = auto() + FFN_ACT = auto() + FFN_GATE_EXP = auto() + FFN_DOWN_EXP = auto() + FFN_UP_EXP = auto() + ATTN_Q_NORM = auto() + ATTN_K_NORM = auto() + LAYER_OUT_NORM = auto() + SSM_IN = auto() + SSM_CONV1D = auto() + SSM_X = auto() + SSM_DT = auto() + SSM_A = auto() + SSM_D = auto() + SSM_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GROK: "grok", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.PERSIMMON: "persimmon", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.NOMIC_BERT: "nomic-bert", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", - MODEL_ARCH.GEMMA: "gemma", - MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.XVERSE: "xverse", - MODEL_ARCH.COMMAND_R: "command-r", - MODEL_ARCH.DBRX: "dbrx", + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.PERSIMMON: "persimmon", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.BLOOM: "bloom", + 
MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.STABLELM2: "stablelm2", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PLAMO: "plamo", + MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.XVERSE: "xverse", + MODEL_ARCH.COMMAND_R: "command-r", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", - MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", - MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", - MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", - MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", - MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", - MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", - MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: 
"blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -440,6 +443,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + ], + MODEL_ARCH.STABLELM2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, ], MODEL_ARCH.QWEN: [ MODEL_TENSOR.TOKEN_EMBD, @@ -701,55 +722,55 @@ class MODEL_TENSOR(IntEnum): class TokenType(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 + UNUSED = 5 + BYTE = 6 class RopeScalingType(Enum): - NONE = 'none' - LINEAR = 'linear' - YARN = 'yarn' + NONE = "none" + LINEAR = "linear" + YARN = "yarn" class PoolingType(IntEnum): NONE = 0 MEAN = 1 - CLS = 2 + CLS = 2 class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 IQ2_XXS = 16 - IQ2_XS = 17 + IQ2_XS = 17 IQ3_XXS = 18 - IQ1_S = 19 - IQ4_NL = 20 - IQ3_S = 21 - IQ2_S = 22 - IQ4_XS = 23 - I8 = 24 - I16 = 25 - I32 = 26 - I64 = 27 - F64 = 28 - IQ1_M = 29 + IQ1_S = 19 + IQ4_NL = 20 + IQ3_S = 21 + IQ2_S = 22 + IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 + I64 = 27 + F64 = 28 + IQ1_M = 29 class GGUFEndian(IntEnum): @@ -758,18 +779,18 @@ class GGUFEndian(IntEnum): class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 - UINT64 = 10 - INT64 = 11 + BOOL = 7 + STRING = 8 + ARRAY = 9 + UINT64 = 10 + INT64 = 11 FLOAT64 = 12 @staticmethod @@ -794,94 +815,94 @@ def get_type(val: Any) -> GGUFValueType: QK_K = 256 # Items here are (block size, type size) GGML_QUANT_SIZES = { - GGMLQuantizationType.F32: (1, 4), - GGMLQuantizationType.F16: (1, 2), - GGMLQuantizationType.Q4_0: (32, 2 + 16), - GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), - GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), - GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), - GGMLQuantizationType.Q8_0: (32, 2 + 32), - GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), - GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), - GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), - GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), - GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), + GGMLQuantizationType.F32: (1, 4), + GGMLQuantizationType.F16: (1, 2), + GGMLQuantizationType.Q4_0: (32, 2 + 16), + GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), + GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), + GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 
+ 16), + GGMLQuantizationType.Q8_0: (32, 2 + 32), + GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), + GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), + GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), + GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), + GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), + GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4), - GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), + GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8), - GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), - GGMLQuantizationType.IQ4_NL: (32, 2 + 16), - GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), - GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), - GGMLQuantizationType.I8: (1, 1), - GGMLQuantizationType.I16: (1, 2), - GGMLQuantizationType.I32: (1, 4), - GGMLQuantizationType.I64: (1, 8), - GGMLQuantizationType.F64: (1, 8), + GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), + GGMLQuantizationType.IQ4_NL: (32, 2 + 16), + GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), + GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), } # Aliases for backward compatibility. 
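# A minimal sketch of how the (block size, type size) pairs in GGML_QUANT_SIZES
# above translate into packed tensor sizes. The helper name below is illustrative
# only (it is not part of gguf-py's API), and `n_elements` is assumed to be a
# multiple of the block size.
def ggml_quant_nbytes(qtype: GGMLQuantizationType, n_elements: int) -> int:
    block_size, type_size = GGML_QUANT_SIZES[qtype]
    assert n_elements % block_size == 0, "element count must be a multiple of the block size"
    return (n_elements // block_size) * type_size

# Example: Q4_0 packs 32 weights into 18 bytes (a float16 scale plus 16 bytes of
# 4-bit quants), so a 4096 x 4096 Q4_0 tensor occupies
# ggml_quant_nbytes(GGMLQuantizationType.Q4_0, 4096 * 4096) == 9_437_184 bytes.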
# general -KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE +KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION -KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT -KEY_GENERAL_NAME = Keys.General.NAME -KEY_GENERAL_AUTHOR = Keys.General.AUTHOR -KEY_GENERAL_URL = Keys.General.URL -KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION -KEY_GENERAL_LICENSE = Keys.General.LICENSE -KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL -KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO -KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE +KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT +KEY_GENERAL_NAME = Keys.General.NAME +KEY_GENERAL_AUTHOR = Keys.General.AUTHOR +KEY_GENERAL_URL = Keys.General.URL +KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION +KEY_GENERAL_LICENSE = Keys.General.LICENSE +KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL +KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO +KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE # LLM -KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE -KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH -KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH -KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT -KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH +KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE +KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH +KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH +KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT +KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL -KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT +KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT # attention -KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT -KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV -KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS -KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV -KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS +KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT +KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV +KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS +KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV +KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS # RoPE -KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE -KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE -KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR +KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT +KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE +KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE +KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN -KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED # SSM -KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL -KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE -KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE +KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL +KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE +KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK # tokenization -KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL -KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST +KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL +KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE -KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES -KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES -KEY_TOKENIZER_BOS_ID = 
Keys.Tokenizer.BOS_ID -KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID -KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID -KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID -KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID -KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID -KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID -KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON -KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV +KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES +KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES +KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID +KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID +KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID +KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID +KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON +KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ec6fcbb838425..f83d27ff75bb6 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -17,43 +17,38 @@ class TensorNameMap: "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon - "wte", # gpt2 - "transformer.embd.wte", # phi2 - "model.tok_embeddings", # internlm2 - "model.embedding", # mamba-qbert - "backbone.embedding", # mamba - "backbone.embeddings", # mamba-hf - "transformer.in_out_embed", # Grok + "wte", # gpt2 + "transformer.embd.wte", # phi2 + "model.tok_embeddings", # internlm2 + "model.embedding", # mamba-qbert + "backbone.embedding", # mamba + "backbone.embeddings", # mamba-hf + "transformer.in_out_embed", # Grok ), - # Token type embeddings MODEL_TENSOR.TOKEN_TYPES: ( "embeddings.token_type_embeddings", # bert nomic-bert ), - # Normalization of token embeddings MODEL_TENSOR.TOKEN_EMBD_NORM: ( "word_embeddings_layernorm", # bloom - "embeddings.LayerNorm", # bert - "emb_ln", # nomic-bert + "embeddings.LayerNorm", # bert + "emb_ln", # nomic-bert ), - # Position embeddings MODEL_TENSOR.POS_EMBD: ( - "transformer.wpe", # gpt2 + "transformer.wpe", # gpt2 "embeddings.position_embeddings", # bert - "wpe", # gpt2 + "wpe", # gpt2 ), - # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon - "lm_head.linear", # phi2 + "lm_head.linear", # phi2 ), - # Output norm MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox @@ -63,30 +58,27 @@ class TensorNameMap: "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 "language_model.encoder.final_layernorm", # persimmon - "model.final_layernorm", # persimmon - "lm_head.ln", # phi2 - "model.norm_f", # mamba-qbert - "backbone.norm_f", # mamba - "transformer.rms_norm", # Grok + "model.final_layernorm", # persimmon + "lm_head.ln", # phi2 + "model.norm_f", # mamba-qbert + "backbone.norm_f", # mamba + "transformer.rms_norm", # Grok ), - # Rope frequencies - MODEL_TENSOR.ROPE_FREQS: ( - "rope.freqs", # llama-pth - ), + MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",), # llama-pth } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Attention norm MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen - "transformer.blocks.{bid}.norm_1", # mpt - "transformer.h.{bid}.input_layernorm", # falcon7b - "h.{bid}.input_layernorm", # bloom - 
"transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf - "layers.{bid}.attention_norm", # llama-pth + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf + "layers.{bid}.attention_norm", # llama-pth "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi "h.{bid}.ln_1", # gpt2 @@ -98,12 +90,8 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm", # Grok "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx ), - # Attention norm 2 - MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", # falcon40b - ), - + MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",), # falcon40b # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox @@ -113,45 +101,41 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon - "model.layers.{bid}.self_attn.query_key_value", # persimmon - "h.{bid}.attn.c_attn", # gpt2 - "transformer.h.{bid}.mixer.Wqkv", # phi2 - "encoder.layers.{bid}.attn.Wqkv", # nomic-bert + "model.layers.{bid}.self_attn.query_key_value", # persimmon + "h.{bid}.attn.c_attn", # gpt2 + "transformer.h.{bid}.mixer.Wqkv", # phi2 + "encoder.layers.{bid}.attn.Wqkv", # nomic-bert ), - # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf - "layers.{bid}.attention.wq", # llama-pth - "encoder.layer.{bid}.attention.self.query", # bert - "transformer.h.{bid}.attn.q_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.q_proj", # plamo - "model.layers.{bid}.attention.wq", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok + "model.layers.{bid}.self_attn.q_proj", # llama-hf + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok ), - # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf - "layers.{bid}.attention.wk", # llama-pth - "encoder.layer.{bid}.attention.self.key", # bert - "transformer.h.{bid}.attn.k_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.k_proj", # plamo - "model.layers.{bid}.attention.wk", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok + "model.layers.{bid}.self_attn.k_proj", # llama-hf + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok ), - # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf - "layers.{bid}.attention.wv", # llama-pth - "encoder.layer.{bid}.attention.self.value", # bert - "transformer.h.{bid}.attn.v_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.v_proj", # plamo - 
"model.layers.{bid}.attention.wv", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok + "model.layers.{bid}.self_attn.v_proj", # llama-hf + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok ), - # Attention output MODEL_TENSOR.ATTN_OUT: ( "gpt_neox.layers.{bid}.attention.dense", # gptneox @@ -172,8 +156,7 @@ class TensorNameMap: "encoder.layers.{bid}.attn.out_proj", # nomic-bert "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx - ), - + ), # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", # bert @@ -181,169 +164,186 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), - # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( - "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf - "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth - "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo - "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell + "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf + "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth + "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo + "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell ), - # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( - "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen - "h.{bid}.post_attention_layernorm", # bloom - "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf - "layers.{bid}.ffn_norm", # llama-pth + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 refact qwen + "h.{bid}.post_attention_layernorm", # bloom + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf + "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon - "model.layers.{bid}.ln2", # yi - "h.{bid}.ln_2", # gpt2 - "model.layers.{bid}.ffn_norm", # internlm2 - "transformer.decoder_layer.{bid}.rms_norm_2", # Grok + "model.layers.{bid}.ln2", # yi + "h.{bid}.ln_2", # gpt2 + "model.layers.{bid}.ffn_norm", # internlm2 + "transformer.decoder_layer.{bid}.rms_norm_2", # Grok ), - MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral "transformer.decoder_layer.{bid}.router", # Grok "transformer.blocks.{bid}.ffn.router.layer", # dbrx ), - # Feed-forward up MODEL_TENSOR.FFN_UP: ( - "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 - "transformer.blocks.{bid}.ffn.up_proj", # mpt - "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon - "h.{bid}.mlp.dense_h_to_4h", # bloom - "model.layers.{bid}.mlp.up_proj", # llama-hf refact - "layers.{bid}.feed_forward.w3", # llama-pth - "encoder.layer.{bid}.intermediate.dense", # bert - "transformer.h.{bid}.mlp.fc_in", # gpt-j + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 + 
"transformer.blocks.{bid}.ffn.up_proj", # mpt + "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "h.{bid}.mlp.dense_h_to_4h", # bloom + "model.layers.{bid}.mlp.up_proj", # llama-hf refact + "layers.{bid}.feed_forward.w3", # llama-pth + "encoder.layer.{bid}.intermediate.dense", # bert + "transformer.h.{bid}.mlp.fc_in", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "transformer.h.{bid}.mlp.w1", # qwen - "h.{bid}.mlp.c_fc", # gpt2 - "transformer.h.{bid}.mlp.fc1", # phi2 - "model.layers.{bid}.mlp.fc1", # phi2 - "model.layers.layers.{bid}.mlp.up_proj", # plamo - "model.layers.{bid}.feed_forward.w3", # internlm2 - "encoder.layers.{bid}.mlp.fc11", # nomic-bert - "model.layers.{bid}.mlp.c_fc", # starcoder2 + "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "transformer.h.{bid}.mlp.w1", # qwen + "h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.fc1", # phi2 + "model.layers.layers.{bid}.mlp.up_proj", # plamo + "model.layers.{bid}.feed_forward.w3", # internlm2 + "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "model.layers.{bid}.mlp.c_fc", # starcoder2 ), - MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx ), - # AWQ-activation gate - MODEL_TENSOR.FFN_ACT: ( - "transformer.blocks.{bid}.ffn.act", # mpt - ), - + MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - "encoder.layers.{bid}.mlp.fc12", # nomic-bert + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert ), - MODEL_TENSOR.FFN_GATE_EXP: ( "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - ), - + ), # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( - "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.ffn.down_proj", # mpt - "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon - "h.{bid}.mlp.dense_4h_to_h", # bloom - "model.layers.{bid}.mlp.down_proj", # llama-hf - "layers.{bid}.feed_forward.w2", # llama-pth - "encoder.layer.{bid}.output.dense", # bert - "transformer.h.{bid}.mlp.fc_out", # gpt-j + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen + "transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "h.{bid}.mlp.dense_4h_to_h", # bloom + "model.layers.{bid}.mlp.down_proj", # llama-hf + "layers.{bid}.feed_forward.w2", # llama-pth + "encoder.layer.{bid}.output.dense", # bert + "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "h.{bid}.mlp.c_proj", # gpt2 - "transformer.h.{bid}.mlp.fc2", # phi2 - 
"model.layers.{bid}.mlp.fc2", # phi2 - "model.layers.layers.{bid}.mlp.down_proj", # plamo - "model.layers.{bid}.feed_forward.w2", # internlm2 - "encoder.layers.{bid}.mlp.fc2", # nomic-bert - "model.layers.{bid}.mlp.c_proj", # starcoder2 + "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "h.{bid}.mlp.c_proj", # gpt2 + "transformer.h.{bid}.mlp.fc2", # phi2 + "model.layers.{bid}.mlp.fc2", # phi2 + "model.layers.layers.{bid}.mlp.down_proj", # plamo + "model.layers.{bid}.feed_forward.w2", # internlm2 + "encoder.layers.{bid}.mlp.fc2", # nomic-bert + "model.layers.{bid}.mlp.c_proj", # starcoder2 ), - MODEL_TENSOR.FFN_DOWN_EXP: ( "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx ), - MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", - "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere - "transformer.blocks.{bid}.attn.q_ln", # sea-lion + "model.layers.{bid}.self_attn.q_layernorm", # persimmon + "model.layers.{bid}.self_attn.q_layernorm.norms.0", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.1", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.2", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.3", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.4", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.5", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.6", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.7", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.8", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.9", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.10", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.11", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.12", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.13", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.14", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.15", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.16", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.17", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.18", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.19", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.20", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.21", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.22", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.23", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.24", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.25", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.26", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.27", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.28", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.29", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.30", # stablelm + "model.layers.{bid}.self_attn.q_layernorm.norms.31", # stablelm + "model.layers.{bid}.self_attn.q_norm", # cohere + "transformer.blocks.{bid}.attn.q_ln", # sea-lion ), - MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", - "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere - 
"transformer.blocks.{bid}.attn.k_ln", # sea-lion + "model.layers.{bid}.self_attn.k_layernorm", # persimmon + "model.layers.{bid}.self_attn.k_layernorm.norms.0", # stablelm + "model.layers.{bid}.self_attn.k_layernorm.norms.1", # stablelm + "model.layers.{bid}.self_attn.k_layernorm.norms.2", # stablelm + "model.layers.{bid}.self_attn.k_layernorm.norms.3", # stablelm + "model.layers.{bid}.self_attn.k_layernorm.norms.4", # stablelm + "model.layers.{bid}.self_attn.k_layernorm.norms.5", # stablelm + "model.layers.{bid}.self_attn.k_layernorm.norms.6", # stablelm + "model.layers.{bid}.self_attn.k_layernorm.norms.7", # stablelm + "model.layers.{bid}.self_attn.k_norm", # cohere + "transformer.blocks.{bid}.attn.k_ln", # sea-lion ), - MODEL_TENSOR.ROPE_FREQS: ( "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon ), - MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok ), - MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", ), - MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", ), - MODEL_TENSOR.SSM_X: ( "model.layers.{bid}.x_proj", "backbone.layers.{bid}.mixer.x_proj", ), - MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", ), - MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", ), - MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", ), - MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", @@ -368,31 +368,35 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int): # TODO: make this configurable n_experts = 8 for xid in range(n_experts): - tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) + tensor_name = TENSOR_NAMES[tensor].format(bid=bid, xid=xid) self.mapping[tensor_name] = (tensor, tensor_name) for key in keys: - key = key.format(bid = bid, xid = xid) + key = key.format(bid=bid, xid=xid) self.mapping[key] = (tensor, tensor_name) - def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: + def get_type_and_name( + self, key: str, try_suffixes: Sequence[str] = () + ) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) if result is not None: return result for suffix in try_suffixes: if key.endswith(suffix): - result = self.mapping.get(key[:-len(suffix)]) + result = self.mapping.get(key[: -len(suffix)]) if result is not None: return result[0], result[1] + suffix return None def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: - result = self.get_type_and_name(key, try_suffixes = try_suffixes) + result = self.get_type_and_name(key, try_suffixes=try_suffixes) if result is None: return None return result[1] - def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: - result = self.get_type_and_name(key, try_suffixes = try_suffixes) + def get_type( + self, key: str, try_suffixes: Sequence[str] = () + ) -> MODEL_TENSOR | None: + result = self.get_type_and_name(key, try_suffixes=try_suffixes) if result is None: return None return result[0] From b89fa9734d9187b26144d2a3dfe5573fb069c728 Mon Sep 17 00:00:00 2001 From: Ashish 
<1856117+ashishdatta@users.noreply.github.com> Date: Fri, 12 Apr 2024 02:28:44 -0700 Subject: [PATCH 03/25] StableLM-2-12b model support --- llama.cpp | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/llama.cpp b/llama.cpp index b93c1abcd85d6..12e8e4a2b4246 100644 --- a/llama.cpp +++ b/llama.cpp @@ -207,6 +207,7 @@ enum llm_arch { LLM_ARCH_NOMIC_BERT, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, + LLM_ARCH_STABLELM2, LLM_ARCH_QWEN, LLM_ARCH_QWEN2, LLM_ARCH_PHI2, @@ -240,6 +241,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_STABLELM, "stablelm2" }, { LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_PHI2, "phi2" }, @@ -704,6 +706,25 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_STABLELM2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + }, + }, { LLM_ARCH_QWEN, { @@ -3857,6 +3878,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_STABLELM2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_12B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_QWEN: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5051,6 +5081,39 @@ static bool llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; + case LLM_ARCH_STABLELM2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); @@ -5066,6 +5129,35 @@ static bool llm_load_tensors( model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}); + } + } break; + case LLM_ARCH_QWEN: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); ggml_context * ctx_split = ctx_for_layer_split(i); @@ -8170,6 +8262,147 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_stablelm2() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = 
ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + if (model.layers[il].attn_q_norm) { + Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur) * n_embd_head, + ggml_element_size(Qcur) * n_embd_head * n_head, + 0); + cb(Qcur, "Qcur", il); + Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, + ggml_element_size(Kcur) * n_embd_head, + ggml_element_size(Kcur) * n_embd_head * n_head_kv, + 0); + cb(Kcur, "Kcur", il); + + + Qcur = llm_build_norm(ctx0, Qcur, hparams, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM, cb, il); + cb(Qcur, "Qcur", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM, cb, il); + cb(Kcur, "Kcur", il); + } + + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = llm_build_ffn(ctx0, ffn_inp, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } struct ggml_cgraph * build_qwen() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -9873,6 +10106,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_stablelm(); } break; + case LLM_ARCH_STABLELM2: + { + result = llm.build_stablelm2(); + } break; case LLM_ARCH_QWEN: { result = llm.build_qwen(); @@ -14791,6 +15028,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_STABLELM: + case LLM_ARCH_STABLELM2: case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: case 
LLM_ARCH_PHI2: From b5afc44704fbbde9697c71077a7cb49485eb4d28 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Fri, 12 Apr 2024 02:32:38 -0700 Subject: [PATCH 04/25] fix --- llama.cpp | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/llama.cpp b/llama.cpp index 12e8e4a2b4246..18950ec2ae72a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5129,35 +5129,6 @@ static bool llm_load_tensors( model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - for (int i = 0; i < n_layer; ++i) { - ggml_context * ctx_layer = ctx_for_layer(i); - ggml_context * ctx_split = ctx_for_layer_split(i); - - auto & layer = model.layers[i]; - - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - - layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}); - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}); - } - } break; - case LLM_ARCH_QWEN: - { - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - - // output - { - model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); - } - for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); ggml_context * ctx_split = ctx_for_layer_split(i); From 0eb8492ccbaabcb82d365145d2051f87770e6490 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Fri, 12 Apr 2024 02:33:08 -0700 Subject: [PATCH 05/25] Added 12B support --- llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llama.cpp b/llama.cpp index 18950ec2ae72a..e5a9f709d7262 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1729,6 +1729,7 @@ enum e_model { MODEL_4B, MODEL_7B, MODEL_8B, + MODEL_12B, MODEL_13B, MODEL_14B, MODEL_15B, From 15a5e7db4cd7c1997051935486f5fed08e7dd95f Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Fri, 12 Apr 2024 22:48:21 -0700 Subject: [PATCH 06/25] Removed autoformatting; resolved bug where model_arch was not selecting StableLM2 --- convert-hf-to-gguf.py | 800 ++++++++------------------------- gguf-py/gguf/constants.py | 540 +++++++++++----------- gguf-py/gguf/tensor_mapping.py | 330 +++++++------- 3 files changed, 614 insertions(+), 1056 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b037c50cb2000..eb6fe0ea42940 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -11,16 +11,7 @@ from abc import ABC, abstractmethod from enum import IntEnum from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Callable, - ContextManager, - Iterator, - Sequence, - TypeVar, - cast, -) +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast import numpy as np import torch @@ -28,14 
+19,14 @@ if TYPE_CHECKING: from torch import Tensor -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf from convert import LlamaHfVocab, permute -###### MODEL DEFINITIONS ###### +###### MODEL DEFINITIONS ###### class SentencePieceTokenTypes(IntEnum): NORMAL = 1 @@ -52,31 +43,18 @@ class SentencePieceTokenTypes(IntEnum): class Model(ABC): _model_classes: dict[str, type[Model]] = {} - def __init__( - self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool - ): + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out self.is_big_endian = is_big_endian - self.endianess = ( - gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - ) + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.is_safetensors = self._is_model_safetensors() - self.num_parts = Model.count_model_parts( - self.dir_model, ".safetensors" if self.is_safetensors else ".bin" - ) + self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) - self.gguf_writer = gguf.GGUFWriter( - fname_out, - gguf.MODEL_ARCH_NAMES[self.model_arch], - endianess=self.endianess, - use_temp_file=False, - ) - self.block_count = self.find_hparam( - ["n_layers", "num_hidden_layers", "n_layer"] - ) + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) @property @abstractmethod @@ -100,39 +78,20 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: ctx: ContextManager[Any] if self.is_safetensors: from safetensors import safe_open - - ctx = cast( - ContextManager[Any], - safe_open(self.dir_model / part_name, framework="pt", device="cpu"), - ) + ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) else: - ctx = contextlib.nullcontext( - torch.load( - str(self.dir_model / part_name), - map_location="cpu", - mmap=True, - weights_only=True, - ) - ) + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) with ctx as model_part: for name in model_part.keys(): - data = ( - model_part.get_tensor(name) - if self.is_safetensors - else model_part[name] - ) + data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] yield name, data def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.block_count) - if ( - n_ctx := self.find_hparam( - ["max_position_embeddings", "n_ctx"], optional=True - ) - ) is not None: + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) print(f"gguf: context length = {n_ctx}") @@ -140,9 +99,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(n_embd) print(f"gguf: embedding length = {n_embd}") - if ( - n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True) - ) is not None: + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: 
self.gguf_writer.add_feed_forward_length(n_ff) print(f"gguf: feed forward length = {n_ff}") @@ -160,11 +117,7 @@ def set_gguf_parameters(self): if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) print(f"gguf: rms norm epsilon = {f_rms_eps}") - if ( - f_norm_eps := self.find_hparam( - ["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True - ) - ) is not None: + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) print(f"gguf: layer norm epsilon = {f_norm_eps}") if (n_experts := self.hparams.get("num_local_experts")) is not None: @@ -178,20 +131,11 @@ def set_gguf_parameters(self): print(f"gguf: file type = {self.ftype}") def write_tensors(self): - block_count = self.hparams.get( - "n_layers", - self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - ) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith( - ( - ".attention.masked_bias", - ".attention.bias", - ".attention.rotary_emb.inv_freq", - ) - ): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype @@ -216,21 +160,11 @@ def write_tensors(self): data = data.astype(np.float32) # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ( - self.ftype == 1 - and data_dtype == np.float16 - and (n_dims == 1 or new_name.endswith("_norm.weight")) - ): + if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and not new_name.endswith("_norm.weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -271,7 +205,6 @@ def func(modelcls: type[Model]): for name in names: cls._model_classes[name] = modelcls return modelcls - return func @classmethod @@ -279,7 +212,7 @@ def from_model_architecture(cls, arch): try: return cls._model_classes[arch] except KeyError: - raise NotImplementedError(f"Architecture {arch!r} not supported!") from None + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None def _is_model_safetensors(self) -> bool: return Model.count_model_parts(self.dir_model, ".safetensors") > 0 @@ -288,17 +221,11 @@ def _get_part_names(self): if self.is_safetensors: if self.num_parts == 1: # there's only one .safetensors file return ("model.safetensors",) - return ( - f"model-{n:05}-of-{self.num_parts:05}.safetensors" - for n in range(1, self.num_parts + 1) - ) + return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) if self.num_parts == 1: # there's only one .bin file return ("pytorch_model.bin",) - return ( - f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" - for n in range(1, self.num_parts + 1) - ) + return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, 
self.num_parts + 1)) # used for GPT-2 BPE and WordPiece vocabs def get_basic_vocab(self) -> tuple[list[str], list[int]]: @@ -306,14 +233,11 @@ def get_basic_vocab(self) -> tuple[list[str], list[int]]: toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size - reverse_vocab = { - id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items() - } + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() for i in range(vocab_size): @@ -348,7 +272,6 @@ def _set_vocab_qwen(self): toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams["vocab_size"] assert max(tokenizer.get_vocab().values()) < vocab_size @@ -362,13 +285,11 @@ def _set_vocab_qwen(self): continue merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) assert len(merged) == 2 - merges.append(" ".join(map(QwenModel.token_bytes_to_string, merged))) + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens - reverse_vocab = { - id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items() - } + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} for i in range(vocab_size): if i not in reverse_vocab: @@ -389,22 +310,16 @@ def _set_vocab_qwen(self): special_vocab.merges = merges # only add special tokens when they were not already loaded from config.json if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token( - "bos", tokenizer.special_tokens["<|endoftext|>"] - ) - special_vocab._set_special_token( - "eos", tokenizer.special_tokens["<|endoftext|>"] - ) + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # this one is usually not in config.json anyway - special_vocab._set_special_token( - "unk", tokenizer.special_tokens["<|endoftext|>"] - ) + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / "tokenizer.model" + tokenizer_path = self.dir_model / 'tokenizer.model' tokens: list[bytes] = [] scores: list[float] = [] @@ -414,7 +329,7 @@ def _set_vocab_sentencepiece(self): raise FileNotFoundError(f"File not found: {tokenizer_path}") tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) for token_id in range(tokenizer.vocab_size()): piece = tokenizer.id_to_piece(token_id) @@ -435,7 +350,7 @@ def _set_vocab_sentencepiece(self): scores.append(score) toktypes.append(toktype) - added_tokens_file = self.dir_model / "added_tokens.json" + added_tokens_file = self.dir_model / 'added_tokens.json' if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) @@ -492,15 +407,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) 
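        # (editor note, illustrative sketch) The rope dimension count written a few
        # lines below is the slice of each attention head that RoPE actually rotates.
        # With a hypothetical GPT-NeoX style config:
        #   hidden_size = 2048, num_attention_heads = 16, rotary_pct = 0.25
        #   head_dim  = 2048 // 16        # -> 128
        #   rope_dims = int(0.25 * 128)   # -> 32
        # i.e. only the first 32 of the 128 dimensions per head get rotary embeddings.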
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count( - int( - self.hparams["rotary_pct"] - * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - ), + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual( - self.hparams.get("use_parallel_residual", True) - ) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) @@ -530,13 +440,10 @@ def write_tensors(self): n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) for name, data_torch in tensors.items(): - if ( - "lm_head.weight" not in tensors.keys() - and "output.weight" not in tensors.keys() - ): + if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): has_lm_head = False - name = re.sub(r"transformer\.", "", name) + name = re.sub(r'transformer\.', '', name) old_dtype = data_torch.dtype @@ -590,12 +497,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") @@ -604,10 +506,7 @@ def write_tensors(self): if not has_lm_head and name == "word_embeddings.weight": self.gguf_writer.add_tensor("output.weight", data) - print( - name, - f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}", - ) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") @Model.register("MPTForCausalLM") @@ -639,26 +538,16 @@ def set_gguf_parameters(self): if self.hparams["attn_config"]["clip_qkv"] is not None: self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias( - self.hparams["attn_config"]["alibi_bias_max"] - ) + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) else: self.gguf_writer.add_max_alibi_bias(0.0) def write_tensors(self): - block_count = self.hparams.get( - "n_layers", self.hparams.get("num_hidden_layers") - ) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith( - ( - ".attention.masked_bias", - ".attention.bias", - ".attention.rotary_emb.inv_freq", - ) - ): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype @@ -671,9 +560,7 @@ def write_tensors(self): # map tensor names if "scales" in name: - new_name = tensor_map.get_name( - name, try_suffixes=(".weight", ".bias", ".scales") - ) + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) if new_name is not None: new_name = new_name.replace("scales", "act.scales") else: @@ -694,12 +581,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == 
np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -782,17 +664,10 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print( - f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}" - ) + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -827,22 +702,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) def write_tensors(self): # Collect tensors from generator object @@ -853,19 +721,14 @@ def write_tensors(self): head_count_kv = self.hparams.get("num_key_value_heads", head_count) for i in range(block_count): - if ( - w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight") - ) is not None: + if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: print(f"Unpacking and permuting layer {i}") - model_kv[ - f"model.layers.{i}.self_attn.q_proj.weight" - ] = self._reverse_hf_permute_part(w, 0, head_count, head_count) - model_kv[ - f"model.layers.{i}.self_attn.k_proj.weight" - ] = self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) - model_kv[ - f"model.layers.{i}.self_attn.v_proj.weight" - ] = self._reverse_hf_part(w, 2) + model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ + self._reverse_hf_permute_part(w, 0, head_count, head_count) + model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ + self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) + model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ + self._reverse_hf_part(w, 2) del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] for name, data_torch in model_kv.items(): @@ -899,48 +762,31 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: 
data = data.astype(np.float16) - print( - f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}" - ) + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) - def _reverse_hf_permute( - self, weights: Tensor, n_head: int, n_kv_head: int | None = None - ) -> Tensor: + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape) ) def _reverse_hf_permute_part( - self, - weights: Tensor, - n_part: int, - n_head: int, - n_head_kv: int | None = None, + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, ) -> Tensor: r = weights.shape[0] // 3 - return self._reverse_hf_permute( - weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv - ) + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: r = weights.shape[0] // 3 - return weights[r * n_part : r * n_part + r, ...] + return weights[r * n_part:r * n_part + r, ...] @Model.register("XverseForCausalLM") @@ -956,23 +802,20 @@ def set_vocab(self): toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size - reverse_vocab = { - id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items() - } + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode("utf-8") + token_text = reverse_vocab[token_id].encode('utf-8') # replace "\x00" to string with length > 0 if token_text == b"\x00": toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode("utf-8") - elif re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): toktype = gguf.TokenType.BYTE # special elif reverse_vocab[token_id] in added_vocab: if tokenizer.added_tokens_decoder[token_id].special: @@ -1016,22 +859,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - 
self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) def write_tensors(self): # Collect tensors from generator object @@ -1054,13 +890,9 @@ def write_tensors(self): # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute( - data_torch, head_count, head_count - ) + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute( - data_torch, head_count, head_count_kv - ) + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) data = data_torch.squeeze().numpy() @@ -1082,29 +914,18 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print( - f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}" - ) + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) - def _reverse_hf_permute( - self, weights: Tensor, n_head: int, n_kv_head: int | None = None - ) -> Tensor: + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape) ) @@ -1172,9 +993,7 @@ def write_tensors(self): # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py if "query_key_value" in name: - qkv = data_torch.view( - n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head - ) + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) @@ -1200,12 +1019,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1272,19 +1086,13 @@ def write_tensors(self): tensors = dict(self.get_tensors()) for i in range(block_count): if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[ - : n_head_kv * head_dim - ] - tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[ - n_head_kv * head_dim : - ] + tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] + tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] del tensors[f"transformer.h.{i}.attn.kv.weight"] if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: 
tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w del tensors[f"transformer.h.{i}.attn.q.weight"] - if ( - w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight") - ) is not None: + if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] @@ -1316,12 +1124,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1334,14 +1137,12 @@ class PersimmonModel(Model): model_arch = gguf.MODEL_ARCH.PERSIMMON def set_gguf_parameters(self): - block_count = self.hparams.get( - "num_layers", self.hparams.get("num_hidden_layers") - ) + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) head_count = self.hparams["num_attention_heads"] head_count_kv = head_count hidden_size = self.hparams["hidden_size"] - self.gguf_writer.add_name("persimmon-8b-chat") + self.gguf_writer.add_name('persimmon-8b-chat') self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -1364,9 +1165,7 @@ def set_vocab(self): # self.gguf_writer.add_eos_token_id(71013) def write_tensors(self): - block_count = self.hparams.get( - "num_layers", self.hparams.get("num_hidden_layers") - ) + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): @@ -1384,22 +1183,19 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) -@Model.register( - "StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM" -) -class StableLM2Model(Model): +@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(Model): model_arch = gguf.MODEL_ARCH.STABLELM - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - model_arch = ( + self.model_arch = ( gguf.MODEL_ARCH.STABLELM if self.hparams["num_hidden_layers"] < 40 else gguf.MODEL_ARCH.STABLELM2 ) self.gguf_writer = gguf.GGUFWriter( self.fname_out, - gguf.MODEL_ARCH_NAMES[model_arch], + gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False, ) @@ -1408,7 +1204,7 @@ def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() else: - # StableLM 2 uses a vocab in a similar format to Qwen's vocab + # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab self._set_vocab_qwen() def set_gguf_parameters(self): @@ -1422,22 +1218,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count( - int( - rotary_factor - * (hparams["hidden_size"] // hparams["num_attention_heads"]) - ) - ) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * 
(hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_parallel_residual( - hparams["use_parallel_residual"] - if "use_parallel_residual" in hparams - else True - ) - self.gguf_writer.add_layer_norm_eps( - self.find_hparam(["layer_norm_eps", "norm_eps"]) - ) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") @@ -1446,7 +1231,7 @@ class LlamaModel(Model): def set_vocab(self): try: - self._set_vocab_sentencepiece() + self. _set_vocab_sentencepiece() except FileNotFoundError: self._set_vocab_llama_hf() @@ -1454,16 +1239,11 @@ def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count( - hparams["hidden_size"] // hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) # Same as super class, but permuting q_proj, k_proj def write_tensors(self): - block_count = self.hparams.get( - "n_layers", - self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - ) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) n_head = self.hparams.get("num_attention_heads") n_kv_head = self.hparams.get("num_key_value_heads") @@ -1471,13 +1251,7 @@ def write_tensors(self): experts = dict() for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith( - ( - ".attention.masked_bias", - ".attention.bias", - ".attention.rotary_emb.inv_freq", - ) - ): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype @@ -1526,20 +1300,14 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32: data = data.astype(np.float16) - merged_name = ( - f"layers.{bid}.feed_forward.experts.w{wid}.weight" - ) + merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight" - new_name = tensor_map.get_name( - merged_name, try_suffixes=(".weight", ".bias") - ) + new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() - print( - f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}" - ) + print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) continue @@ -1562,12 +1330,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1593,22 +1356,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_name("Grok") def write_tensors(self): - block_count = self.hparams.get( - "n_layers", - self.hparams.get("num_hidden_layers", 
self.hparams.get("n_layer")), - ) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) n_experts = self.hparams.get("num_local_experts") experts = dict() for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith( - ( - ".attention.masked_bias", - ".attention.bias", - ".attention.rotary_emb.inv_freq", - ) - ): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype @@ -1650,20 +1404,14 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32: data = data.astype(np.float16) - merged_name = ( - f"transformer.decoder_layer.{bid}.moe.{wid}.weight" - ) + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" - new_name = tensor_map.get_name( - merged_name, try_suffixes=(".weight", ".bias") - ) + new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() - print( - f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}" - ) + print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) continue @@ -1686,12 +1434,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1806,9 +1549,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) @@ -1817,37 +1558,24 @@ def set_gguf_parameters(self): def set_vocab(self): self._set_vocab_llama_hf() - def _reverse_hf_permute( - self, weights: Tensor, n_head: int, n_kv_head: int | None = None - ) -> Tensor: + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape) ) def write_tensors(self): - block_count = self.hparams.get( - "n_layers", - self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - ) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) n_head = self.hparams.get("num_attention_heads") n_kv_head = self.hparams.get("num_key_value_heads") for name, data_torch in 
self.get_tensors(): # we don't need these - if name.endswith( - ( - ".attention.masked_bias", - ".attention.bias", - ".attention.rotary_emb.inv_freq", - ) - ): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype @@ -1882,12 +1610,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1902,14 +1625,11 @@ class QwenModel(Model): @staticmethod def token_bytes_to_string(b): from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) @staticmethod - def bpe( - mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None - ) -> list[bytes]: + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -1922,11 +1642,7 @@ def bpe( if min_rank is None or (max_rank is not None and min_rank >= max_rank): break assert min_idx is not None - parts = ( - parts[:min_idx] - + [parts[min_idx] + parts[min_idx + 1]] - + parts[min_idx + 2 :] - ) + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] return parts def set_vocab(self): @@ -1939,9 +1655,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) @@ -1980,19 +1694,14 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) -@Model.register("wen2ForCausalLM") +@Model.register("Qwen2ForCausalLM") class Qwen2Model(Model): model_arch = gguf.MODEL_ARCH.QWEN2 @@ -2012,28 +1721,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): - block_count = self.hparams.get( - "n_layers", - self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - ) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith( - ( - 
".attention.masked_bias", - ".attention.bias", - ".attention.rotary_emb.inv_freq", - ".attn.bias", - ".attn.masked_bias", - ) - ): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): continue - if name.endswith( - (".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight") - ): + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): data_torch = data_torch.transpose(1, 0) old_dtype = data_torch.dtype @@ -2062,12 +1758,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2092,18 +1783,14 @@ def set_gguf_parameters(self): n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length( - self.find_hparam(["n_positions", "max_position_embeddings"]) - ) + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps( - self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]) - ) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_add_bos_token(False) @@ -2126,9 +1813,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv( - 5 - ) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) def shuffle_attn_q_weight(self, data_torch): @@ -2146,9 +1831,7 @@ def shuffle_attn_output_weight(self, data_torch): return data_torch def write_tensors(self): - block_count = self.hparams.get( - "num_layers", self.hparams.get("num_hidden_layers") - ) + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): @@ -2187,12 +1870,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2221,15 +1899,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_factor(1.0) def write_tensors(self): - block_count = self.hparams.get( - "n_layers", - self.hparams.get("num_hidden_layers", 
self.hparams.get("n_layer")), - ) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) tensors = dict(self.get_tensors()) - has_lm_head = ( - "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() - ) + has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() for name, data_torch in tensors.items(): # we don't need these if name.endswith((".attn.rotary_emb.inv_freq")): @@ -2261,12 +1934,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2275,10 +1943,7 @@ def write_tensors(self): if not has_lm_head and name == "transformer.wte.weight": self.gguf_writer.add_tensor("output.weight", data) - print( - name, - f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}", - ) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") @Model.register("InternLM2ForCausalLM") @@ -2293,14 +1958,14 @@ def set_vocab(self): from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path = self.dir_model / "tokenizer.model" + tokenizer_path = self.dir_model / 'tokenizer.model' tokens: list[bytes] = [] scores: list[float] = [] toktypes: list[int] = [] if not tokenizer_path.is_file(): - print(f"Error: Missing {tokenizer_path}", file=sys.stderr) + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) sys.exit(1) sentencepiece_model = model.ModelProto() @@ -2308,7 +1973,7 @@ def set_vocab(self): add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) for token_id in range(vocab_size): piece = tokenizer.id_to_piece(token_id) @@ -2334,7 +1999,7 @@ def set_vocab(self): scores.append(score) toktypes.append(toktype) - added_tokens_file = self.dir_model / "added_tokens.json" + added_tokens_file = self.dir_model / 'added_tokens.json' if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) @@ -2355,16 +2020,14 @@ def set_vocab(self): if "chat" in os.path.basename(self.dir_model.absolute()): # For the chat model, we replace the eos with '<|im_end|>'. special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - print( - f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ -in chat mode so that the conversation can end normally." 
- ) + print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ +in chat mode so that the conversation can end normally.") special_vocab.add_to_gguf(self.gguf_writer) def _try_get_sft_eos(self, tokenizer): - unused_145_list = tokenizer.encode("[UNUSED_TOKEN_145]") - im_end_list = tokenizer.encode("<|im_end|>") + unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') + im_end_list = tokenizer.encode('<|im_end|>') assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) if len(unused_145_list) == 1: eos_token = unused_145_list[0] @@ -2375,13 +2038,9 @@ def _try_get_sft_eos(self, tokenizer): def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv - return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) def set_gguf_parameters(self): self.gguf_writer.add_name("InternLM2") @@ -2421,12 +2080,7 @@ def post_write_tensors(self, tensor_map, name, data_torch): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2454,35 +2108,15 @@ def write_tensors(self): if re.match(qkv_pattern, name): bid = re.findall(qkv_pattern, name)[0] qkv = data_torch - qkv = rearrange( - qkv.T, - " o (g n i) ->o g n i", - g=num_groups, - n=q_per_kv + 2, - i=head_dim, - ) - q, k, v = ( - qkv[..., :q_per_kv, :], - qkv[..., q_per_kv : q_per_kv + 1, :], - qkv[..., q_per_kv + 1 : q_per_kv + 2, :], - ) + qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) + q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] # The model weights of q and k equire additional reshape. 
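            # (editor note, assumption-flagged sketch) Layout of the rearranged qkv
            # above: each of the `num_groups` KV groups holds `q_per_kv` query heads
            # followed by one key head and one value head, which is why the slices
            # split at q_per_kv and q_per_kv + 1. The q and k slices are then run
            # through _hf_permute_qk below so their per-head ordering matches what
            # the GGUF/llama.cpp side expects.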
- q = self._hf_permute_qk( - rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads - ) - k = self._hf_permute_qk( - rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads - ) + q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) v = rearrange(v, " o g n i -> o (g n i)").T - self.post_write_tensors( - tensor_map, f"model.layers.{bid}.attention.wq.weight", q - ) - self.post_write_tensors( - tensor_map, f"model.layers.{bid}.attention.wk.weight", k - ) - self.post_write_tensors( - tensor_map, f"model.layers.{bid}.attention.wv.weight", v - ) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) else: self.post_write_tensors(tensor_map, name, data_torch) @@ -2512,9 +2146,7 @@ def set_gguf_parameters(self): # get pooling type if pooling_path is not None: - with open( - self.dir_model / pooling_path / "config.json", encoding="utf-8" - ) as f: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: pooling = json.load(f) if pooling["pooling_mode_mean_tokens"]: pooling_type = gguf.PoolingType.MEAN @@ -2539,7 +2171,6 @@ def phantom(tok): if tok.startswith("##"): return tok[2:] return "\u2581" + tok - tokens = list(map(phantom, tokens)) # add vocab to gguf @@ -2556,11 +2187,7 @@ def write_tensors(self): tensors = dict(self.get_tensors()) for name, data_torch in tensors.items(): # we are only using BERT for embeddings so we don't need the pooling layer - if name in ( - "embeddings.position_ids", - "pooler.dense.weight", - "pooler.dense.bias", - ): + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): continue # we don't need these # map tensor names @@ -2574,11 +2201,8 @@ def write_tensors(self): new_dtype: type[np.floating[Any]] if ( - self.ftype == 1 - and name.endswith(".weight") - and n_dims == 2 - and name - != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32 + self.ftype == 1 and name.endswith(".weight") and n_dims == 2 + and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32 ): # if f16 desired, convert any float32 2-dim weight tensors to float16 new_dtype = np.float16 @@ -2641,21 +2265,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv( - self.hparams["num_key_value_heads"] - if "num_key_value_heads" in hparams - else hparams["num_attention_heads"] - ) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): - block_count = self.hparams.get( - "n_layers", - self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - ) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) 
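        # (editor note) block_count falls back across the hparam spellings used by
        # different HF configs ("n_layers", "num_hidden_layers", "n_layer").
        # For example, a hypothetical config containing only {"num_hidden_layers": 28}
        # resolves to 28: the outer get() misses "n_layers" and returns the value of
        # the nested get().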
for name, data_torch in self.get_tensors(): @@ -2682,12 +2299,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - self.ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2718,25 +2330,17 @@ def set_vocab(self): else: # Use the GPT-NeoX tokenizer when no tokenizer files are present tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - print( - f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'" - ) + print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") neox_reader = gguf.GGUFReader(tokenizer_path, "r") field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) - self.gguf_writer.add_token_list( - [bytes(field.parts[i]) for i in field.data][:vocab_size] - ) + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - self.gguf_writer.add_token_types( - [field.parts[i].tolist()[0] for i in field.data][:vocab_size] - ) + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) - self.gguf_writer.add_token_merges( - [bytes(field.parts[i]) for i in field.data] - ) + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) @@ -2746,37 +2350,23 @@ def set_vocab(self): def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = ( - self.find_hparam(["intermediate_size", "d_inner"], optional=True) - or 2 * d_model - ) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 # ceiling division # ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -( - d_model // -16 - ) - rms_norm_eps = ( - self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) - or 1e-5 - ) + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length( - 2**20 - ) # arbitrary value; for those who use the default + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length( - 
0 - ) # unused, but seemingly required when loading - self.gguf_writer.add_head_count( - 0 - ) # unused, but seemingly required when loading + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) @@ -2791,7 +2381,7 @@ def write_tensors(self): tok_embd = None tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight" - output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight" + output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight" for name, data_torch in self.get_tensors(): old_dtype = data_torch.dtype @@ -2832,17 +2422,8 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert big float32 2-dim weight tensors to float16 - new_weight_name = ( - new_name[: -len(".weight")] if new_name.endswith(".weight") else "" - ) - if ( - self.ftype == 1 - and data_dtype == np.float32 - and new_weight_name.endswith( - (".ssm_in", ".ssm_out", "token_embd", "output") - ) - and n_dims == 2 - ): + new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else "" + if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -2872,36 +2453,25 @@ def set_gguf_parameters(self): def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file" - ) + description="Convert a huggingface model to a GGML compatible file") parser.add_argument( - "--vocab-only", - action="store_true", + "--vocab-only", action="store_true", help="extract only the vocab", ) parser.add_argument( - "--awq-path", type=Path, default=None, help="Path to scale awq cache file" - ) + "--awq-path", type=Path, default=None, + help="Path to scale awq cache file") parser.add_argument( - "--outfile", - type=Path, + "--outfile", type=Path, help="path to write to; default: based on input", ) parser.add_argument( - "--outtype", - type=str, - choices=["f32", "f16"], - default="f16", + "--outtype", type=str, choices=["f32", "f16"], default="f16", help="output format - use f32 for float32, f16 for float16", ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") parser.add_argument( - "--bigendian", - action="store_true", - help="model is executed on big endian machine", - ) - parser.add_argument( - "model", - type=Path, + "model", type=Path, help="directory containing model file", ) @@ -2914,9 +2484,8 @@ def main() -> None: dir_model = args.model if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / "awq-py")) + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] - tmp_model_path = args.model / "weighted_model" dir_model = tmp_model_path if tmp_model_path.is_dir(): @@ -2928,18 +2497,19 @@ def main() -> None: print(f"Saved weighted model at {tmp_model_path}.") if not dir_model.is_dir(): - print(f"Error: {args.model} is not a directory", file=sys.stderr) + print(f'Error: {args.model} is not a directory', file=sys.stderr) sys.exit(1) ftype_map = { "f32": gguf.GGMLQuantizationType.F32, "f16": 
gguf.GGMLQuantizationType.F16, } + if args.outfile is not None: fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_model / f"ggml-model-{args.outtype}.gguf" + fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' print(f"Loading model: {dir_model.name}") @@ -2947,9 +2517,7 @@ def main() -> None: with torch.inference_mode(): model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class( - dir_model, ftype_map[args.outtype], fname_out, args.bigendian - ) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) print("Set model parameters") model_instance.set_gguf_parameters() @@ -2967,5 +2535,5 @@ def main() -> None: print(f"Model successfully exported to '{fname_out}'") -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b5248b7497dbd..bff325c690130 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -8,8 +8,8 @@ # constants # -GGUF_MAGIC = 0x46554747 # "GGUF" -GGUF_VERSION = 3 +GGUF_MAGIC = 0x46554747 # "GGUF" +GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 # @@ -19,79 +19,77 @@ class Keys: class General: - ARCHITECTURE = "general.architecture" + ARCHITECTURE = "general.architecture" QUANTIZATION_VERSION = "general.quantization_version" - ALIGNMENT = "general.alignment" - NAME = "general.name" - AUTHOR = "general.author" - VERSION = "general.version" - URL = "general.url" - DESCRIPTION = "general.description" - LICENSE = "general.license" - SOURCE_URL = "general.source.url" - SOURCE_HF_REPO = "general.source.huggingface.repository" - FILE_TYPE = "general.file_type" + ALIGNMENT = "general.alignment" + NAME = "general.name" + AUTHOR = "general.author" + VERSION = "general.version" + URL = "general.url" + DESCRIPTION = "general.description" + LICENSE = "general.license" + SOURCE_URL = "general.source.url" + SOURCE_HF_REPO = "general.source.huggingface.repository" + FILE_TYPE = "general.file_type" class LLM: - VOCAB_SIZE = "{arch}.vocab_size" - CONTEXT_LENGTH = "{arch}.context_length" - EMBEDDING_LENGTH = "{arch}.embedding_length" - BLOCK_COUNT = "{arch}.block_count" - FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" + VOCAB_SIZE = "{arch}.vocab_size" + CONTEXT_LENGTH = "{arch}.context_length" + EMBEDDING_LENGTH = "{arch}.embedding_length" + BLOCK_COUNT = "{arch}.block_count" + FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" - TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - EXPERT_COUNT = "{arch}.expert_count" - EXPERT_USED_COUNT = "{arch}.expert_used_count" - POOLING_TYPE = "{arch}.pooling_type" - LOGIT_SCALE = "{arch}.logit_scale" + TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = "{arch}.expert_used_count" + POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" + HEAD_COUNT = "{arch}.attention.head_count" + HEAD_COUNT_KV = "{arch}.attention.head_count_kv" + MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" + CLAMP_KQV = "{arch}.attention.clamp_kqv" + KEY_LENGTH 
= "{arch}.attention.key_length" + VALUE_LENGTH = "{arch}.attention.value_length" + LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" - CAUSAL = "{arch}.attention.causal" + CAUSAL = "{arch}.attention.causal" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - FREQ_BASE = "{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + FREQ_BASE = "{arch}.rope.freq_base" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" class SSM: - CONV_KERNEL = "{arch}.ssm.conv_kernel" - INNER_SIZE = "{arch}.ssm.inner_size" - STATE_SIZE = "{arch}.ssm.state_size" + CONV_KERNEL = "{arch}.ssm.conv_kernel" + INNER_SIZE = "{arch}.ssm.inner_size" + STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" class Tokenizer: - MODEL = "tokenizer.ggml.model" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - TOKEN_TYPE_COUNT = ( - "tokenizer.ggml.token_type_count" # for BERT-style token types - ) - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - CLS_ID = "tokenizer.ggml.cls_token_id" - MASK_ID = "tokenizer.ggml.mask_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" + MODEL = "tokenizer.ggml.model" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + CLS_ID = "tokenizer.ggml.cls_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" # @@ -133,111 +131,111 @@ class MODEL_ARCH(IntEnum): class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() + TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() - TOKEN_TYPES = auto() - POS_EMBD = auto() - OUTPUT = auto() - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_OUT_NORM = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE_INP = auto() - FFN_NORM = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_ACT = auto() - FFN_GATE_EXP = auto() - FFN_DOWN_EXP = auto() - FFN_UP_EXP = auto() - ATTN_Q_NORM = auto() - ATTN_K_NORM = auto() - LAYER_OUT_NORM = auto() - SSM_IN = auto() - SSM_CONV1D = auto() - SSM_X = auto() - SSM_DT = 
auto() - SSM_A = auto() - SSM_D = auto() - SSM_OUT = auto() + TOKEN_TYPES = auto() + POS_EMBD = auto() + OUTPUT = auto() + OUTPUT_NORM = auto() + ROPE_FREQS = auto() + ATTN_Q = auto() + ATTN_K = auto() + ATTN_V = auto() + ATTN_QKV = auto() + ATTN_OUT = auto() + ATTN_NORM = auto() + ATTN_NORM_2 = auto() + ATTN_OUT_NORM = auto() + ATTN_ROT_EMBD = auto() + FFN_GATE_INP = auto() + FFN_NORM = auto() + FFN_GATE = auto() + FFN_DOWN = auto() + FFN_UP = auto() + FFN_ACT = auto() + FFN_GATE_EXP = auto() + FFN_DOWN_EXP = auto() + FFN_UP_EXP = auto() + ATTN_Q_NORM = auto() + ATTN_K_NORM = auto() + LAYER_OUT_NORM = auto() + SSM_IN = auto() + SSM_CONV1D = auto() + SSM_X = auto() + SSM_DT = auto() + SSM_A = auto() + SSM_D = auto() + SSM_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GROK: "grok", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.PERSIMMON: "persimmon", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.NOMIC_BERT: "nomic-bert", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.STABLELM2: "stablelm2", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", - MODEL_ARCH.GEMMA: "gemma", - MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.XVERSE: "xverse", - MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.PERSIMMON: "persimmon", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.BLOOM: "bloom", + MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.STABLELM2: "stablelm2", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PLAMO: "plamo", + MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.XVERSE: "xverse", + MODEL_ARCH.COMMAND_R: "command-r", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", 
- MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", - MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", - MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", - MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", - MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", - MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", - MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -443,8 +441,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.ATTN_Q_NORM, - MODEL_TENSOR.ATTN_K_NORM, ], MODEL_ARCH.STABLELM2: [ MODEL_TENSOR.TOKEN_EMBD, @@ -722,55 +718,55 @@ class MODEL_TENSOR(IntEnum): class TokenType(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 + UNUSED = 5 + BYTE = 6 class RopeScalingType(Enum): - NONE = "none" - LINEAR = "linear" - YARN = "yarn" + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' class PoolingType(IntEnum): NONE = 0 MEAN = 1 - CLS = 2 + CLS = 2 class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 IQ2_XXS = 16 - IQ2_XS = 17 + IQ2_XS = 17 IQ3_XXS = 18 - IQ1_S = 19 - IQ4_NL = 20 - IQ3_S = 21 - IQ2_S = 22 - IQ4_XS = 23 
- I8 = 24 - I16 = 25 - I32 = 26 - I64 = 27 - F64 = 28 - IQ1_M = 29 + IQ1_S = 19 + IQ4_NL = 20 + IQ3_S = 21 + IQ2_S = 22 + IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 + I64 = 27 + F64 = 28 + IQ1_M = 29 class GGUFEndian(IntEnum): @@ -779,18 +775,18 @@ class GGUFEndian(IntEnum): class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 - UINT64 = 10 - INT64 = 11 + BOOL = 7 + STRING = 8 + ARRAY = 9 + UINT64 = 10 + INT64 = 11 FLOAT64 = 12 @staticmethod @@ -815,94 +811,94 @@ def get_type(val: Any) -> GGUFValueType: QK_K = 256 # Items here are (block size, type size) GGML_QUANT_SIZES = { - GGMLQuantizationType.F32: (1, 4), - GGMLQuantizationType.F16: (1, 2), - GGMLQuantizationType.Q4_0: (32, 2 + 16), - GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), - GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), - GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), - GGMLQuantizationType.Q8_0: (32, 2 + 32), - GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), - GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), - GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), - GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), - GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), + GGMLQuantizationType.F32: (1, 4), + GGMLQuantizationType.F16: (1, 2), + GGMLQuantizationType.Q4_0: (32, 2 + 16), + GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), + GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), + GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), + GGMLQuantizationType.Q8_0: (32, 2 + 32), + GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), + GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), + GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), + GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), + GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), + GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4), - GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), + GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8), - GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), - GGMLQuantizationType.IQ4_NL: (32, 2 + 16), - GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), - GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), - GGMLQuantizationType.I8: (1, 1), - GGMLQuantizationType.I16: (1, 2), - GGMLQuantizationType.I32: (1, 4), - GGMLQuantizationType.I64: (1, 8), - GGMLQuantizationType.F64: (1, 8), + GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), + GGMLQuantizationType.IQ4_NL: (32, 2 + 16), + GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), + GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), } # Aliases for backward 
compatibility. # general -KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE +KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION -KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT -KEY_GENERAL_NAME = Keys.General.NAME -KEY_GENERAL_AUTHOR = Keys.General.AUTHOR -KEY_GENERAL_URL = Keys.General.URL -KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION -KEY_GENERAL_LICENSE = Keys.General.LICENSE -KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL -KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO -KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE +KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT +KEY_GENERAL_NAME = Keys.General.NAME +KEY_GENERAL_AUTHOR = Keys.General.AUTHOR +KEY_GENERAL_URL = Keys.General.URL +KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION +KEY_GENERAL_LICENSE = Keys.General.LICENSE +KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL +KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO +KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE # LLM -KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE -KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH -KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH -KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT -KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH +KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE +KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH +KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH +KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT +KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL -KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT +KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT # attention -KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT -KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV -KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS -KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV -KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS +KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT +KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV +KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS +KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV +KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS # RoPE -KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE -KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE -KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR +KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT +KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE +KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE +KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN -KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED # SSM -KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL -KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE -KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE +KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL +KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE +KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK # tokenization -KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL -KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST +KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL +KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE -KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES -KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES 
-KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID -KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID -KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID -KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID -KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID -KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID -KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID -KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON -KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV +KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES +KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES +KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID +KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID +KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID +KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID +KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON +KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f83d27ff75bb6..81ed5ccba9188 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -17,38 +17,43 @@ class TensorNameMap: "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon - "wte", # gpt2 - "transformer.embd.wte", # phi2 - "model.tok_embeddings", # internlm2 - "model.embedding", # mamba-qbert - "backbone.embedding", # mamba - "backbone.embeddings", # mamba-hf - "transformer.in_out_embed", # Grok + "wte", # gpt2 + "transformer.embd.wte", # phi2 + "model.tok_embeddings", # internlm2 + "model.embedding", # mamba-qbert + "backbone.embedding", # mamba + "backbone.embeddings", # mamba-hf + "transformer.in_out_embed", # Grok ), + # Token type embeddings MODEL_TENSOR.TOKEN_TYPES: ( "embeddings.token_type_embeddings", # bert nomic-bert ), + # Normalization of token embeddings MODEL_TENSOR.TOKEN_EMBD_NORM: ( "word_embeddings_layernorm", # bloom - "embeddings.LayerNorm", # bert - "emb_ln", # nomic-bert + "embeddings.LayerNorm", # bert + "emb_ln", # nomic-bert ), + # Position embeddings MODEL_TENSOR.POS_EMBD: ( - "transformer.wpe", # gpt2 + "transformer.wpe", # gpt2 "embeddings.position_embeddings", # bert - "wpe", # gpt2 + "wpe", # gpt2 ), + # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon - "lm_head.linear", # phi2 + "lm_head.linear", # phi2 ), + # Output norm MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox @@ -58,27 +63,30 @@ class TensorNameMap: "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 "language_model.encoder.final_layernorm", # persimmon - "model.final_layernorm", # persimmon - "lm_head.ln", # phi2 - "model.norm_f", # mamba-qbert - "backbone.norm_f", # mamba - "transformer.rms_norm", # Grok + "model.final_layernorm", # persimmon + "lm_head.ln", # phi2 + "model.norm_f", # mamba-qbert + "backbone.norm_f", # mamba + "transformer.rms_norm", # Grok ), + # Rope frequencies - MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",), # llama-pth + MODEL_TENSOR.ROPE_FREQS: ( + "rope.freqs", # llama-pth + ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Attention norm MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen - "transformer.blocks.{bid}.norm_1", # mpt - "transformer.h.{bid}.input_layernorm", # falcon7b - "h.{bid}.input_layernorm", # 
bloom - "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf - "layers.{bid}.attention_norm", # llama-pth + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf + "layers.{bid}.attention_norm", # llama-pth "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi "h.{bid}.ln_1", # gpt2 @@ -90,8 +98,12 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm", # Grok "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx ), + # Attention norm 2 - MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",), # falcon40b + MODEL_TENSOR.ATTN_NORM_2: ( + "transformer.h.{bid}.ln_attn", # falcon40b + ), + # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox @@ -101,41 +113,45 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon - "model.layers.{bid}.self_attn.query_key_value", # persimmon - "h.{bid}.attn.c_attn", # gpt2 - "transformer.h.{bid}.mixer.Wqkv", # phi2 - "encoder.layers.{bid}.attn.Wqkv", # nomic-bert + "model.layers.{bid}.self_attn.query_key_value", # persimmon + "h.{bid}.attn.c_attn", # gpt2 + "transformer.h.{bid}.mixer.Wqkv", # phi2 + "encoder.layers.{bid}.attn.Wqkv", # nomic-bert ), + # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf - "layers.{bid}.attention.wq", # llama-pth - "encoder.layer.{bid}.attention.self.query", # bert - "transformer.h.{bid}.attn.q_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.q_proj", # plamo - "model.layers.{bid}.attention.wq", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok + "model.layers.{bid}.self_attn.q_proj", # llama-hf + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok ), + # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf - "layers.{bid}.attention.wk", # llama-pth - "encoder.layer.{bid}.attention.self.key", # bert - "transformer.h.{bid}.attn.k_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.k_proj", # plamo - "model.layers.{bid}.attention.wk", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok + "model.layers.{bid}.self_attn.k_proj", # llama-hf + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok ), + # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf - "layers.{bid}.attention.wv", # llama-pth - "encoder.layer.{bid}.attention.self.value", # bert - "transformer.h.{bid}.attn.v_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.v_proj", # plamo - 
"model.layers.{bid}.attention.wv", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok + "model.layers.{bid}.self_attn.v_proj", # llama-hf + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok ), + # Attention output MODEL_TENSOR.ATTN_OUT: ( "gpt_neox.layers.{bid}.attention.dense", # gptneox @@ -164,71 +180,81 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), + # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( - "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf - "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth - "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo - "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell + "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf + "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth + "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo + "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell ), + # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( - "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen - "h.{bid}.post_attention_layernorm", # bloom - "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf - "layers.{bid}.ffn_norm", # llama-pth + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 refact qwen + "h.{bid}.post_attention_layernorm", # bloom + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf + "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon - "model.layers.{bid}.ln2", # yi - "h.{bid}.ln_2", # gpt2 - "model.layers.{bid}.ffn_norm", # internlm2 - "transformer.decoder_layer.{bid}.rms_norm_2", # Grok + "model.layers.{bid}.ln2", # yi + "h.{bid}.ln_2", # gpt2 + "model.layers.{bid}.ffn_norm", # internlm2 + "transformer.decoder_layer.{bid}.rms_norm_2", # Grok ), + MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral "transformer.decoder_layer.{bid}.router", # Grok "transformer.blocks.{bid}.ffn.router.layer", # dbrx ), + # Feed-forward up MODEL_TENSOR.FFN_UP: ( - "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 - "transformer.blocks.{bid}.ffn.up_proj", # mpt - "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon - "h.{bid}.mlp.dense_h_to_4h", # bloom - "model.layers.{bid}.mlp.up_proj", # llama-hf refact - "layers.{bid}.feed_forward.w3", # llama-pth - "encoder.layer.{bid}.intermediate.dense", # bert - "transformer.h.{bid}.mlp.fc_in", # gpt-j + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 + "transformer.blocks.{bid}.ffn.up_proj", # mpt + "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "h.{bid}.mlp.dense_h_to_4h", # bloom + "model.layers.{bid}.mlp.up_proj", # llama-hf refact + "layers.{bid}.feed_forward.w3", # llama-pth + "encoder.layer.{bid}.intermediate.dense", # bert + "transformer.h.{bid}.mlp.fc_in", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", 
# persimmon - "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "transformer.h.{bid}.mlp.w1", # qwen - "h.{bid}.mlp.c_fc", # gpt2 - "transformer.h.{bid}.mlp.fc1", # phi2 - "model.layers.{bid}.mlp.fc1", # phi2 - "model.layers.layers.{bid}.mlp.up_proj", # plamo - "model.layers.{bid}.feed_forward.w3", # internlm2 - "encoder.layers.{bid}.mlp.fc11", # nomic-bert - "model.layers.{bid}.mlp.c_fc", # starcoder2 + "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "transformer.h.{bid}.mlp.w1", # qwen + "h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.fc1", # phi2 + "model.layers.layers.{bid}.mlp.up_proj", # plamo + "model.layers.{bid}.feed_forward.w3", # internlm2 + "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "model.layers.{bid}.mlp.c_fc", # starcoder2 ), + MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx ), + # AWQ-activation gate - MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt + MODEL_TENSOR.FFN_ACT: ( + "transformer.blocks.{bid}.ffn.act", # mpt + ), + # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - "encoder.layers.{bid}.mlp.fc12", # nomic-bert + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert ), + MODEL_TENSOR.FFN_GATE_EXP: ( "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) @@ -236,114 +262,86 @@ class TensorNameMap: ), # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( - "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.ffn.down_proj", # mpt - "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon - "h.{bid}.mlp.dense_4h_to_h", # bloom - "model.layers.{bid}.mlp.down_proj", # llama-hf - "layers.{bid}.feed_forward.w2", # llama-pth - "encoder.layer.{bid}.output.dense", # bert - "transformer.h.{bid}.mlp.fc_out", # gpt-j + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen + "transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "h.{bid}.mlp.dense_4h_to_h", # bloom + "model.layers.{bid}.mlp.down_proj", # llama-hf + "layers.{bid}.feed_forward.w2", # llama-pth + "encoder.layer.{bid}.output.dense", # bert + "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "h.{bid}.mlp.c_proj", # gpt2 - "transformer.h.{bid}.mlp.fc2", # phi2 - "model.layers.{bid}.mlp.fc2", # phi2 - "model.layers.layers.{bid}.mlp.down_proj", # plamo - "model.layers.{bid}.feed_forward.w2", # internlm2 - "encoder.layers.{bid}.mlp.fc2", # nomic-bert - "model.layers.{bid}.mlp.c_proj", # starcoder2 + "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "h.{bid}.mlp.c_proj", # gpt2 + "transformer.h.{bid}.mlp.fc2", # phi2 + "model.layers.{bid}.mlp.fc2", # phi2 + 
"model.layers.layers.{bid}.mlp.down_proj", # plamo + "model.layers.{bid}.feed_forward.w2", # internlm2 + "encoder.layers.{bid}.mlp.fc2", # nomic-bert + "model.layers.{bid}.mlp.c_proj", # starcoder2 ), + MODEL_TENSOR.FFN_DOWN_EXP: ( "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx ), + MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", - "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_layernorm.norms.0", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.1", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.2", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.3", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.4", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.5", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.6", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.7", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.8", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.9", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.10", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.11", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.12", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.13", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.14", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.15", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.16", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.17", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.18", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.19", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.20", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.21", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.22", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.23", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.24", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.25", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.26", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.27", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.28", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.29", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.30", # stablelm - "model.layers.{bid}.self_attn.q_layernorm.norms.31", # stablelm - "model.layers.{bid}.self_attn.q_norm", # cohere - "transformer.blocks.{bid}.attn.q_ln", # sea-lion + "model.layers.{bid}.self_attn.q_layernorm", # persimmon + "model.layers.{bid}.self_attn.q_norm", # cohere + "transformer.blocks.{bid}.attn.q_ln", # sea-lion ), + MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", - "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_layernorm.norms.0", # stablelm - "model.layers.{bid}.self_attn.k_layernorm.norms.1", # stablelm - "model.layers.{bid}.self_attn.k_layernorm.norms.2", # stablelm - "model.layers.{bid}.self_attn.k_layernorm.norms.3", # stablelm - "model.layers.{bid}.self_attn.k_layernorm.norms.4", # stablelm - "model.layers.{bid}.self_attn.k_layernorm.norms.5", # stablelm - "model.layers.{bid}.self_attn.k_layernorm.norms.6", # stablelm - 
"model.layers.{bid}.self_attn.k_layernorm.norms.7", # stablelm - "model.layers.{bid}.self_attn.k_norm", # cohere - "transformer.blocks.{bid}.attn.k_ln", # sea-lion + "model.layers.{bid}.self_attn.k_layernorm", # persimmon + "model.layers.{bid}.self_attn.k_norm", # cohere + "transformer.blocks.{bid}.attn.k_ln", # sea-lion ), + MODEL_TENSOR.ROPE_FREQS: ( "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon ), + MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok ), + MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", ), + MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", ), + MODEL_TENSOR.SSM_X: ( "model.layers.{bid}.x_proj", "backbone.layers.{bid}.mixer.x_proj", ), + MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", ), + MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", ), + MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", ), + MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", @@ -368,35 +366,31 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int): # TODO: make this configurable n_experts = 8 for xid in range(n_experts): - tensor_name = TENSOR_NAMES[tensor].format(bid=bid, xid=xid) + tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) self.mapping[tensor_name] = (tensor, tensor_name) for key in keys: - key = key.format(bid=bid, xid=xid) + key = key.format(bid = bid, xid = xid) self.mapping[key] = (tensor, tensor_name) - def get_type_and_name( - self, key: str, try_suffixes: Sequence[str] = () - ) -> tuple[MODEL_TENSOR, str] | None: + def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) if result is not None: return result for suffix in try_suffixes: if key.endswith(suffix): - result = self.mapping.get(key[: -len(suffix)]) + result = self.mapping.get(key[:-len(suffix)]) if result is not None: return result[0], result[1] + suffix return None def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: - result = self.get_type_and_name(key, try_suffixes=try_suffixes) + result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None return result[1] - def get_type( - self, key: str, try_suffixes: Sequence[str] = () - ) -> MODEL_TENSOR | None: - result = self.get_type_and_name(key, try_suffixes=try_suffixes) + def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None return result[0] From 91a3db9e7d7c27b731388139cf532e13e2821161 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Fri, 12 Apr 2024 23:27:07 -0700 Subject: [PATCH 07/25] Formatting --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index e5a9f709d7262..e5a85ad9805b6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -241,7 +241,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_BLOOM, "bloom" }, { 
LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_STABLELM, "stablelm2" }, + { LLM_ARCH_STABLELM2, "stablelm2" }, { LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_PHI2, "phi2" }, From 29d940b0d75a2ffa81aa946ccaecbd88515b9eb1 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sat, 13 Apr 2024 19:09:37 -0700 Subject: [PATCH 08/25] Do QK norm stacking in model conversion step --- convert-hf-to-gguf.py | 102 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index eb6fe0ea42940..07a8a8d3bbece 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1208,7 +1208,6 @@ def set_vocab(self): self._set_vocab_qwen() def set_gguf_parameters(self): - super().set_gguf_parameters() hparams = self.hparams block_count = hparams["num_hidden_layers"] @@ -1224,6 +1223,107 @@ def set_gguf_parameters(self): self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + n_head = self.hparams.get("num_attention_heads") + n_kv_head = self.hparams.get("num_key_value_heads") + q_norms = dict() + k_norms = dict() + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + n_dims = len(data.shape) + if name.find("q_layernorm.norms") != -1: + q_norms[name] = data + if len(q_norms) >= (block_count * n_head): + for bid in range(block_count): + datas = [] + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight" + datas.append(q_norms[ename]) + del q_norms[ename] + data = np.stack(datas, axis=0) + data_dtype = data.dtype + merged_name = f"model.layers.{bid}.self_attn.q_layernorm.weight" + new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) + if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: + data = data.astype(np.float16) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + continue + if name.find("k_layernorm.norms") != -1: + k_norms[name] = data + if len(k_norms) >= (block_count * n_kv_head): + for bid in range(block_count): + full = True + datas = [] + for xid in range(n_kv_head): + ename = f"model.layers.{bid}.self_attn.k_layernorm.norms.{xid}.weight" + datas.append(k_norms[ename]) + del k_norms[ename] + data = np.stack(datas, axis=0) + data_dtype = data.dtype + merged_name = f"model.layers.{bid}.self_attn.k_layernorm.weight" + new_name 
= tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) + if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: + data = data.astype(np.float16) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + continue + + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") class LlamaModel(Model): From 0ec53cfff77f01be078e77e682716e04ba4d610f Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sat, 13 Apr 2024 21:12:30 -0700 Subject: [PATCH 09/25] Converge StableLM and StableLM2 code to simplify graph construction --- convert-hf-to-gguf.py | 14 --- gguf-py/gguf/constants.py | 16 --- llama.cpp | 249 +++++++++----------------------------- 3 files changed, 55 insertions(+), 224 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 07a8a8d3bbece..42cffab778e6d 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1186,20 +1186,6 @@ def write_tensors(self): @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") class StableLMModel(Model): model_arch = gguf.MODEL_ARCH.STABLELM - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.model_arch = ( - gguf.MODEL_ARCH.STABLELM - if self.hparams["num_hidden_layers"] < 40 - else gguf.MODEL_ARCH.STABLELM2 - ) - self.gguf_writer = gguf.GGUFWriter( - self.fname_out, - gguf.MODEL_ARCH_NAMES[self.model_arch], - endianess=self.endianess, - use_temp_file=False, - ) - def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bff325c690130..9da13dfc91236 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -113,7 +113,6 @@ class MODEL_ARCH(IntEnum): NOMIC_BERT = auto() BLOOM = auto() STABLELM = auto() - STABLELM2 = auto() QWEN = auto() QWEN2 = auto() PHI2 = auto() @@ -184,7 +183,6 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.STABLELM2: "stablelm2", 
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.PHI2: "phi2", @@ -441,20 +439,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.STABLELM2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, MODEL_TENSOR.ATTN_Q_NORM, MODEL_TENSOR.ATTN_K_NORM, ], diff --git a/llama.cpp b/llama.cpp index e5a85ad9805b6..25c2b6bd67c21 100644 --- a/llama.cpp +++ b/llama.cpp @@ -207,7 +207,6 @@ enum llm_arch { LLM_ARCH_NOMIC_BERT, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, - LLM_ARCH_STABLELM2, LLM_ARCH_QWEN, LLM_ARCH_QWEN2, LLM_ARCH_PHI2, @@ -241,7 +240,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_STABLELM2, "stablelm2" }, { LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_PHI2, "phi2" }, @@ -704,23 +702,6 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_STABLELM2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, @@ -3876,14 +3857,6 @@ static void llm_load_hparams( switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - case LLM_ARCH_STABLELM2: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - - switch (hparams.n_layer) { case 40: model.type = e_model::MODEL_12B; break; default: model.type = e_model::MODEL_UNKNOWN; } @@ -5079,42 +5052,16 @@ static bool llm_load_tensors( layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + if (n_layer >= 40) { + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}); - layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); - } - } break; - case LLM_ARCH_STABLELM2: - { - model.tok_embd = 
ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - - // output - { - model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); - } - - for (int i = 0; i < n_layer; ++i) { - ggml_context * ctx_layer = ctx_for_layer(i); - ggml_context * ctx_split = ctx_for_layer_split(i); - - auto & layer = model.layers[i]; - - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - - layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); - layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); - layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head}); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}); + } + if (n_layer < 40) { + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + } layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); @@ -8121,6 +8068,8 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; + struct ggml_tensor * ffn_inp; + struct ggml_tensor * attn_out = cur; inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); @@ -8139,127 +8088,10 @@ struct llm_build_context { model.layers[il].attn_norm_b, LLM_NORM, cb, il); cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = llm_build_kv(ctx0, 
model, hparams, kv_self, gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + if(n_layer >= 40) { + ffn_inp = cur; } - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, - model.output_norm_b, - LLM_NORM, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_stablelm2() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - - for (int il = 0; il < n_layer; ++il) { - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, cb, il); - cb(cur, "attn_norm", il); - struct ggml_tensor * ffn_inp = cur; - // self-attention { // compute Q and K and RoPE them @@ -8283,7 +8115,6 @@ struct llm_build_context { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); } - if (model.layers[il].attn_q_norm) { Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, ggml_element_size(Qcur) * n_embd_head, @@ -8317,6 +8148,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); + Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, @@ -8329,31 +8161,64 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } - if (il == n_layer - 1) { + if (il == n_layer - 1 && n_layer < 40) { // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + else if (il == n_layer - 1 && n_layer >= 40){ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); } - struct ggml_tensor * attn_out = cur; + if (n_layer < 40) { + ffn_inp = ggml_add(ctx0, cur, inpSA); + 
cb(ffn_inp, "ffn_inp", il); + } + else { + attn_out = cur; + } // feed-forward network { - cur = llm_build_ffn(ctx0, ffn_inp, + if (n_layer < 40) { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + else { + cur = llm_build_ffn(ctx0, ffn_inp, model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il); + } } - // add together residual + FFN + self-attention - cur = ggml_add(ctx0, cur, inpL); - cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "l_out", il); + if (n_layer < 40) { + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + } + else { + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + cb(cur, "l_out", il); + } // input for next layer inpL = cur; @@ -8375,6 +8240,7 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_qwen() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -10078,10 +9944,6 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_stablelm(); } break; - case LLM_ARCH_STABLELM2: - { - result = llm.build_stablelm2(); - } break; case LLM_ARCH_QWEN: { result = llm.build_qwen(); @@ -15000,7 +14862,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_STABLELM: - case LLM_ARCH_STABLELM2: case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: case LLM_ARCH_PHI2: From 8dcd9978d268345e672ec02e19d9d08e18a7377b Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sat, 13 Apr 2024 21:15:15 -0700 Subject: [PATCH 10/25] Fix accidental removal --- gguf-py/gguf/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 9da13dfc91236..e7ccbe6c2949b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -196,6 +196,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.DBRX: "dbrx", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { From 0dc779bff901d5230cb2706cf7076a5c1869cfe7 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sat, 13 Apr 2024 21:27:18 -0700 Subject: [PATCH 11/25] Removed warnings --- llama.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 25c2b6bd67c21..1b92da627cf49 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8088,9 +8088,7 @@ struct llm_build_context { model.layers[il].attn_norm_b, LLM_NORM, cb, il); cb(cur, "attn_norm", il); - if(n_layer >= 40) { - ffn_inp = cur; - } + ffn_inp = cur; // self-attention { @@ -8178,9 +8176,7 @@ struct llm_build_context { ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); } - else { - attn_out = cur; - } + attn_out = cur; // feed-forward network { From f7b40d76509e499f6992b99c2fabf28e58419fef Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sat, 13 Apr 2024 23:37:46 -0700 Subject: [PATCH 12/25] Revert formatter --- 
gguf-py/gguf/constants.py | 46 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e7ccbe6c2949b..75f0b236d6bbe 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -98,30 +98,30 @@ class Tokenizer: class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - PERSIMMON = auto() - REFACT = auto() - BERT = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + PERSIMMON = auto() + REFACT = auto() + BERT = auto() NOMIC_BERT = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - PHI2 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - GEMMA = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + PHI2 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + GEMMA = auto() STARCODER2 = auto() MAMBA = auto() XVERSE = auto() From e3f73604d5b80ad9fb9693d1d890e0186dccdee1 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sat, 13 Apr 2024 23:56:32 -0700 Subject: [PATCH 13/25] Move QK norm stack to private function so it's easier to read --- convert-hf-to-gguf.py | 74 +++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 42cffab778e6d..4b03adad54767 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1232,60 +1232,14 @@ def write_tensors(self): if name.find("q_layernorm.norms") != -1: q_norms[name] = data if len(q_norms) >= (block_count * n_head): - for bid in range(block_count): - datas = [] - for xid in range(n_head): - ename = f"model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight" - datas.append(q_norms[ename]) - del q_norms[ename] - data = np.stack(datas, axis=0) - data_dtype = data.dtype - merged_name = f"model.layers.{bid}.self_attn.q_layernorm.weight" - new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) - if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: - data = data.astype(np.float16) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm") continue if name.find("k_layernorm.norms") != -1: k_norms[name] = data if len(k_norms) >= (block_count * n_kv_head): - for bid in range(block_count): - full = True - datas = [] - for xid in range(n_kv_head): - ename = f"model.layers.{bid}.self_attn.k_layernorm.norms.{xid}.weight" - datas.append(k_norms[ename]) - del k_norms[ename] - data = np.stack(datas, axis=0) - data_dtype = data.dtype - merged_name = f"model.layers.{bid}.self_attn.k_layernorm.weight" - new_name = 
tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) - if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: - data = data.astype(np.float16) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm") continue - # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: @@ -1309,6 +1263,30 @@ def write_tensors(self): print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"): + for bid in range(block_count): + datas = [] + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" + datas.append(norms[ename]) + del norms[ename] + data = np.stack(datas, axis=0) + data_dtype = data.dtype + merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) + if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: + data = data.astype(np.float16) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") From 96695fb96bcb74ad8bd2b66a761cca8f53079d2c Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sun, 14 Apr 2024 14:21:25 -0700 Subject: [PATCH 14/25] refactor stablelm graph builder to support 1.6, 3b and 12b more efficiently --- llama.cpp | 104 ++++++++++++++++++++++-------------------------------- 1 file changed, 42 insertions(+), 62 deletions(-) diff --git a/llama.cpp b/llama.cpp index 1b92da627cf49..59e573b34044e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3567,6 +3567,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; case MODEL_8B: return "8B"; + case MODEL_12B: return "12B"; case MODEL_13B: return "13B"; case MODEL_14B: return "14B"; case MODEL_15B: return "15B"; @@ -5052,16 +5053,14 @@ static bool llm_load_tensors( layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); - if (n_layer >= 40) { - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head}); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM,"weight", i), {hparams.n_embd_head_k, 
hparams.n_head_kv}); + // optional q and k layernorms, present in StableLM 2 12B + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head}, false); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false); - } + // optional FFN norm, not present in StableLM 2 12B which uses parallel residual + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); - if (n_layer < 40) { - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - } layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); @@ -8068,8 +8067,6 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - struct ggml_tensor * ffn_inp; - struct ggml_tensor * attn_out = cur; inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); @@ -8080,7 +8077,7 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // norm cur = llm_build_norm(ctx0, inpL, hparams, @@ -8088,7 +8085,8 @@ struct llm_build_context { model.layers[il].attn_norm_b, LLM_NORM, cb, il); cb(cur, "attn_norm", il); - ffn_inp = cur; + + struct ggml_tensor * inpSA = cur; // self-attention { @@ -8113,25 +8111,20 @@ struct llm_build_context { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); } - if (model.layers[il].attn_q_norm) { - Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur) * n_embd_head, - ggml_element_size(Qcur) * n_embd_head * n_head, - 0); - cb(Qcur, "Qcur", il); - Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, - ggml_element_size(Kcur) * n_embd_head, - ggml_element_size(Kcur) * n_embd_head * n_head_kv, - 0); - cb(Kcur, "Kcur", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + if (model.layers[il].attn_q_norm) { Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM, cb, il); cb(Qcur, "Qcur", il); - + } + if (model.layers[il].attn_q_norm) { Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, @@ -8141,14 +8134,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8159,63 +8152,50 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } 
- if (il == n_layer - 1 && n_layer < 40) { + if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - else if (il == n_layer - 1 && n_layer >= 40){ - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); - } - - if (n_layer < 40) { - ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - } - attn_out = cur; + struct ggml_tensor * attn_out = cur; + // only used for non-parallel residual + struct ggml_tensor * ffn_inp = ggml_add(ctx0, attn_out, inpL); + cb(cur, "ffn_inp", il); // feed-forward network { - if (n_layer < 40) { + if (model.layers[il].ffn_norm) { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - else { - cur = llm_build_ffn(ctx0, ffn_inp, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } + } else { + // parallel residual + cur = inpSA; + } + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - if (n_layer < 40) { + if (model.layers[il].ffn_norm) { + // non-parallel residual cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - } - else { + } else { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "l_out", il); } + cb(cur, "l_out", il); + // input for next layer inpL = cur; } From 13c75c21eb5eb0785007b5f3b7be3d1a5a61af14 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sun, 14 Apr 2024 14:28:12 -0700 Subject: [PATCH 15/25] Proper check for None type for new_name to avoid crash; formatting; revert change to base class `write_tensors()` --- convert-hf-to-gguf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4b03adad54767..3a73cce971fa5 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -164,7 +164,7 @@ def write_tensors(self): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") @@ -1264,6 +1264,7 @@ def write_tensors(self): print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) + def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"): for bid in range(block_count): datas = [] @@ -1275,15 +1276,15 @@ def 
_stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, l data_dtype = data.dtype merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): data = data.astype(np.float32) # if f16 desired, convert any float32 2-dim weight tensors to float16 if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: data = data.astype(np.float16) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") From 412a2807cb71766a48edf0e9ecfaec68fe96796a Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sun, 14 Apr 2024 14:30:33 -0700 Subject: [PATCH 16/25] Format --- gguf-py/gguf/tensor_mapping.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 81ed5ccba9188..96e2508a756f3 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -173,6 +173,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx ), + # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", # bert @@ -260,6 +261,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx ), + # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox From 91728faac6ec9e02e227ff3b61689ee77504caa1 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sun, 14 Apr 2024 14:40:23 -0700 Subject: [PATCH 17/25] Formatting --- gguf-py/gguf/tensor_mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 96e2508a756f3..ec6fcbb838425 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -172,7 +172,7 @@ class TensorNameMap: "encoder.layers.{bid}.attn.out_proj", # nomic-bert "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx - ), + ), # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( @@ -260,7 +260,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - ), + ), # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( From bf1a9a551442c6610118728d543a10872ebf1748 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sun, 14 Apr 2024 16:24:00 -0700 Subject: [PATCH 18/25] format Co-authored-by: compilade --- llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 59e573b34044e..097d0b410eee0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8177,11 +8177,11 @@ struct llm_build_context { cur = inpSA; } cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - NULL, - 
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } From d2ab69306629543b7a5913810cb66617d3726a11 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sun, 14 Apr 2024 16:25:55 -0700 Subject: [PATCH 19/25] Fix incorrect check for K norm --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 097d0b410eee0..fa804cc604c1d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8124,7 +8124,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(Qcur, "Qcur", il); } - if (model.layers[il].attn_q_norm) { + if (model.layers[il].attn_k_norm) { Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, @@ -8175,7 +8175,7 @@ struct llm_build_context { } else { // parallel residual cur = inpSA; - } + } cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, From b7f984a0df432bbe16475e3175379df7bb6196ff Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sun, 14 Apr 2024 16:34:40 -0700 Subject: [PATCH 20/25] space after commas; Keep indentation multiple of 4 spaces --- llama.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/llama.cpp b/llama.cpp index fa804cc604c1d..f2fd97d00840a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5054,8 +5054,8 @@ static bool llm_load_tensors( layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); // optional q and k layernorms, present in StableLM 2 12B - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head}, false); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM,"weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false); // optional FFN norm, not present in StableLM 2 12B which uses parallel residual layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false); @@ -8118,18 +8118,18 @@ struct llm_build_context { cb(Kcur, "Kcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, - model.layers[il].attn_q_norm, - NULL, - LLM_NORM, cb, il); - cb(Qcur, "Qcur", il); + Qcur = llm_build_norm(ctx0, Qcur, hparams, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM, cb, il); + cb(Qcur, "Qcur", il); } if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, - model.layers[il].attn_k_norm, - NULL, - LLM_NORM, cb, il); - cb(Kcur, "Kcur", il); + Kcur = llm_build_norm(ctx0, Kcur, hparams, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM, cb, il); + cb(Kcur, "Kcur", il); } From 1f6929e5571ef9bdf4947382740d03d5b533d971 Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Sun, 14 Apr 2024 16:46:26 -0700 Subject: [PATCH 21/25] Flake8 format --- convert-hf-to-gguf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 3a73cce971fa5..120918369f658 100755 --- a/convert-hf-to-gguf.py +++ 
b/convert-hf-to-gguf.py @@ -1186,6 +1186,7 @@ def write_tensors(self): @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") class StableLMModel(Model): model_arch = gguf.MODEL_ARCH.STABLELM + def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() @@ -1290,6 +1291,7 @@ def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, l self.gguf_writer.add_tensor(new_name, data) + @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA From 8a15f932af45f9532bbec48dac7171677cf5616a Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Tue, 16 Apr 2024 06:03:47 -0700 Subject: [PATCH 22/25] Removed unnecessary conditional branches --- llama.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index f2fd97d00840a..45da226b11430 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8185,15 +8185,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - if (model.layers[il].ffn_norm) { - // non-parallel residual - cur = ggml_add(ctx0, cur, ffn_inp); - } else { - // add together residual + FFN + self-attention - cur = ggml_add(ctx0, cur, inpL); - cur = ggml_add(ctx0, cur, attn_out); - } - + cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "l_out", il); // input for next layer From 6ae4dad0041a39d5aad17268d668aa3b47c02e6c Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Tue, 16 Apr 2024 06:11:47 -0700 Subject: [PATCH 23/25] Removed unused comment --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 45da226b11430..650c043ef9752 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8160,7 +8160,6 @@ struct llm_build_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * attn_out = cur; - // only used for non-parallel residual struct ggml_tensor * ffn_inp = ggml_add(ctx0, attn_out, inpL); cb(cur, "ffn_inp", il); From e6ec2033366389c0676a95977666b291d8c49d1a Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Tue, 16 Apr 2024 07:11:58 -0700 Subject: [PATCH 24/25] Fixed incorrect tensor passing --- llama.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 650c043ef9752..77b5380622251 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8159,9 +8159,8 @@ struct llm_build_context { inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * attn_out = cur; - struct ggml_tensor * ffn_inp = ggml_add(ctx0, attn_out, inpL); - cb(cur, "ffn_inp", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { From 94e8c490fe210212bc314f01167881a74575f04a Mon Sep 17 00:00:00 2001 From: Ashish <1856117+ashishdatta@users.noreply.github.com> Date: Tue, 16 Apr 2024 07:57:36 -0700 Subject: [PATCH 25/25] Format --- llama.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 77b5380622251..5e4fb0c3378d6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8159,7 +8159,8 @@ struct llm_build_context { inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); // 
feed-forward network