diff --git a/convert-ggml-to-pth.py b/convert-ggml-to-pth.py
index 20158c9ca8650..8cc67785140f3 100644
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@@ -72,7 +72,12 @@ def dequantize_weights(fin, n_rows, n_cols):
 
 def read_variables(fin):
     model = {}
-    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
+    pbar = tqdm(
+        total=os.path.getsize(fin.name),
+        unit="B",
+        unit_scale=True,
+        desc="Reading variables",
+    )
     while True:
         start_pos = fin.tell()
         try:
@@ -98,7 +103,9 @@ def read_variables(fin):
         data_size = np.prod(shape)
         data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
 
-        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
+        model[name] = torch.tensor(
+            data, dtype=torch.float32 if dtype == np.float32 else torch.float16
+        )
 
         pbar.update(fin.tell() - start_pos)
 
@@ -112,11 +119,17 @@ def convert_to_hf_format(model, hparams):
     dim = hparams["dim"]
     dims_per_head = dim // n_heads
     base = 10000.0
-    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+    inv_freq = 1.0 / (
+        base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
+    )
 
     # permute for sliced rotary
     def permute(w):
-        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+        return (
+            w.view(n_heads, dim // n_heads // 2, 2, dim)
+            .transpose(1, 2)
+            .reshape(dim, dim)
+        )
 
     state_dict = {}
     for layer_i in range(n_layers):
@@ -164,16 +177,22 @@ def permute(w):
 
 
 def chat(model, hparams, llama_dir):
-    from transformers import (GenerationConfig, LlamaForCausalLM,
-                              LlamaTokenizer, StoppingCriteria,
-                              StoppingCriteriaList)
+    from transformers import (
+        GenerationConfig,
+        LlamaForCausalLM,
+        LlamaTokenizer,
+        StoppingCriteria,
+        StoppingCriteriaList,
+    )
     from transformers.models.llama.configuration_llama import LlamaConfig
 
     class StoppingCriteriaSub(StoppingCriteria):
         def __init__(self):
             super().__init__()
 
-        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
+        def __call__(
+            self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]
+        ):
             print(tokenizer.decode(input_ids[0]), end="", flush=True)
             if input_ids[0][-1] == 13:
                 return True
@@ -237,7 +256,11 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
+        "--input_dir",
+        "-i",
+        type=str,
+        required=True,
+        help="The input directory containing the ggml files.",
     )
     parser.add_argument(
         "--prefix",
@@ -252,14 +275,21 @@ def main():
         help="Whether to save the model in the huggingface format. (default: False)",
     )
     parser.add_argument(
-        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
+        "--chat",
+        "-c",
+        action="store_true",
+        help="Whether to open a chat with the model. (default: False)",
     )
     args = parser.parse_args()
 
     llama_dir = os.path.abspath(f"{args.input_dir}/../")
 
     ggml_files = sorted(
-        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
+        [
+            f"{args.input_dir}/{f}"
+            for f in os.listdir(args.input_dir)
+            if f.startswith(args.prefix)
+        ]
     )
 
     fin = open(ggml_files[0], "rb")
diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
index f1d9d7aefe3e0..555ab67d015df 100644
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@@ -15,37 +15,43 @@
 
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
-    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade a GPT4All model to the current format"
+    )
+    parser.add_argument("gpt4all_model", help="path to gpt4all-lora-quantized.bin")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66, # magic: ggml in hex
-        1, # file version
+        0x67676D66,  # magic: ggml in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -71,12 +77,14 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", 0.0))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
@@ -84,9 +92,10 @@ def copy_all_data(f_out, f_in):
         if not buf:
             break
         f_out.write(buf)
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -96,6 +105,7 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
 
+
 def main():
     args = parse_args()
 
@@ -103,5 +113,6 @@ def main():
     convert_one_file(args.gpt4all_model, tokenizer)
 
+
 if __name__ == "__main__":
     main()
diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py
index 6c77808fcd186..1cc4e2fd93fa0 100644
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -20,9 +20,12 @@
 
 model = torch.load(fname_model, map_location="cpu")
 
-n_vocab, n_embd = model['model.embed_tokens.weight'].shape
-n_layer = 1 + max(int(m.group(1)) for name in model
-                  if (m := re.match(r'model\.layers\.([0-9]+)', name)))
+n_vocab, n_embd = model["model.embed_tokens.weight"].shape
+n_layer = 1 + max(
+    int(m.group(1))
+    for name in model
+    if (m := re.match(r"model\.layers\.([0-9]+)", name))
+)
 
 # hardcoded:
 n_mult = 256
@@ -36,14 +39,14 @@
 
 fout = open(fname_out, "wb")
 
-fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
-fout.write(struct.pack("i", 1)) # file version
+fout.write(struct.pack("i", 0x67676D66))  # magic: ggmf in hex
+fout.write(struct.pack("i", 1))  # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
 fout.write(struct.pack("i", n_head))
 fout.write(struct.pack("i", n_layer))
-fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
+fout.write(struct.pack("i", n_embd // n_head))  # rot (obsolete)
 fout.write(struct.pack("i", 4))
@@ -66,16 +69,23 @@
     fout.write(text)
     fout.write(struct.pack("f", tokenizer.get_score(i)))
 
+
 def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode('utf-8')
+    sname = dst_name.encode("utf-8")
     fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
     fout.write(struct.pack("i" * len(shape), *shape[::-1]))
     fout.write(sname)
 
+
 def convert_non_q4(src_name, dst_name):
     v = model[src_name]
     shape = v.shape
-    print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
+    print(
+        "Processing non-Q4 variable: " + src_name + " with shape: ",
+        shape,
+        " and type: ",
+        v.dtype,
+    )
     if len(shape) == 1:
         print(" Converting to float32")
         v = v.to(torch.float32)
@@ -88,11 +98,12 @@
     # data
     v.numpy().tofile(fout)
 
+
 def convert_q4(src_name, dst_name, permute=False):
     zeros = model[f"{src_name}.zeros"].numpy()
     scales = model[f"{src_name}.scales"].numpy()
     bias = model[f"{src_name}.bias"].numpy()
-    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
+    qweight = model[f"{src_name}.qweight"].numpy().T  # transpose
 
     # Q4_1 does not support bias; good thing the bias is always all zeros.
     assert not np.any(bias)
@@ -113,7 +124,7 @@
     # the columns in a row, so we end up wasting quite a bit of memory with
     # repeated scales and addends.
 
-    addends = -zeros # flip sign
+    addends = -zeros  # flip sign
 
     # Since the output format is mixed between integers and floats, we have
     # to hackily view the floats as int32s just so numpy will let us
@@ -128,37 +139,53 @@
     addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
     scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
 
-    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
+    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting="no")
 
     if permute:
         # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
         # This can be done after the above conversion because it doesn't affect column order/layout.
-        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(blob.shape))
+        blob = (
+            blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(blob.shape)
+        )
 
     # header
-    write_header(shape, dst_name, 3) # ftype = Q4_1
+    write_header(shape, dst_name, 3)  # ftype = Q4_1
 
     # data
     blob.tofile(fout)
 
+
 convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
 convert_non_q4("model.norm.weight", "norm.weight")
 convert_non_q4("lm_head.weight", "output.weight")
 
 for i in range(n_layer):
-    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
+    convert_q4(
+        f"model.layers.{i}.self_attn.q_proj",
+        f"layers.{i}.attention.wq.weight",
+        permute=True,
+    )
+    convert_q4(
+        f"model.layers.{i}.self_attn.k_proj",
+        f"layers.{i}.attention.wk.weight",
+        permute=True,
+    )
     convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
     convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
     convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
    convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
-    convert_q4(f"model.layers.{i}.mlp.up_proj",   f"layers.{i}.feed_forward.w3.weight")
-
-    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
-    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
+    convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
+
+    convert_non_q4(
+        f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight"
+    )
+    convert_non_q4(
+        f"model.layers.{i}.post_attention_layernorm.weight",
+        f"layers.{i}.ffn_norm.weight",
+    )
 
 fout.close()
diff --git a/convert-unversioned-ggml-to-ggml.py b/convert-unversioned-ggml-to-ggml.py
index 33b6243bd94e0..1c1b133746d18 100644
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -10,37 +10,43 @@
 
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
-    parser.add_argument('dir_model', help='directory containing ggml .bin files')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade old ggml model files to the current format"
+    )
+    parser.add_argument("dir_model", help="directory containing ggml .bin files")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66, # magic: ggml in hex
-        1, # file version
+        0x67676D66,  # magic: ggml in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -60,12 +66,14 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
@@ -73,9 +81,10 @@ def copy_all_data(f_out, f_in):
         if not buf:
             break
         f_out.write(buf)
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -85,6 +94,7 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
 
+
 def main():
     args = parse_args()
     files = []
@@ -96,5 +106,6 @@ def main():
     for file in files:
         convert_one_file(file, tokenizer)
 
+
 if __name__ == "__main__":
     main()
diff --git a/flake.lock b/flake.lock
index 343996da126e9..f56aab2c995b4 100644
--- a/flake.lock
+++ b/flake.lock
@@ -1,43 +1,34 @@
 {
-  "nodes": {
-    "flake-utils": {
-      "locked": {
-        "lastModified": 1676283394,
-        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
-        "type": "github"
-      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
+    "nodes": {
+        "flake-utils": {
+            "locked": {
+                "lastModified": 1676283394,
+                "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
+                "owner": "numtide",
+                "repo": "flake-utils",
+                "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
+                "type": "github",
+            },
+            "original": {"owner": "numtide", "repo": "flake-utils", "type": "github"},
+        },
+        "nixpkgs": {
+            "locked": {
+                "lastModified": 1678470307,
+                "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
+                "owner": "NixOS",
+                "repo": "nixpkgs",
+                "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
+                "type": "github",
+            },
+            "original": {
+                "owner": "NixOS",
+                "ref": "nixos-unstable",
+                "repo": "nixpkgs",
+                "type": "github",
+            },
+        },
+        "root": {"inputs": {"flake-utils": "flake-utils", "nixpkgs": "nixpkgs"}},
     },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1678470307,
-        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "root": {
-      "inputs": {
-        "flake-utils": "flake-utils",
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
+    "root": "root",
+    "version": 7,
 }
diff --git a/quantize.py b/quantize.py
index 641df8dda1b1e..b01307dbd192e 100644
--- a/quantize.py
+++ b/quantize.py
@@ -25,27 +25,36 @@ def main():
     quantize_script_binary = "quantize"
 
     parser = argparse.ArgumentParser(
-        prog='python3 quantize.py',
-        description='This script quantizes the given models by applying the '
-        f'"{quantize_script_binary}" script on them.'
+        prog="python3 quantize.py",
+        description="This script quantizes the given models by applying the "
+        f'"{quantize_script_binary}" script on them.',
    )
     parser.add_argument(
-        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
-        help='The models to quantize.'
+        "models",
+        nargs="+",
+        choices=("7B", "13B", "30B", "65B"),
+        help="The models to quantize.",
     )
     parser.add_argument(
-        '-r', '--remove-16', action='store_true', dest='remove_f16',
-        help='Remove the f16 model after quantizing it.'
+        "-r",
+        "--remove-16",
+        action="store_true",
+        dest="remove_f16",
+        help="Remove the f16 model after quantizing it.",
     )
     parser.add_argument(
-        '-m', '--models-path', dest='models_path',
+        "-m",
+        "--models-path",
+        dest="models_path",
         default=os.path.join(os.getcwd(), "models"),
-        help='Specify the directory where the models are located.'
+        help="Specify the directory where the models are located.",
     )
     parser.add_argument(
-        '-q', '--quantize-script-path', dest='quantize_script_path',
+        "-q",
+        "--quantize-script-path",
+        dest="quantize_script_path",
         default=os.path.join(os.getcwd(), quantize_script_binary),
-        help='Specify the path to the "quantize" script.'
+        help='Specify the path to the "quantize" script.',
     )
 
     # TODO: Revise this code
@@ -75,12 +84,12 @@ def main():
         )
 
         if not os.path.isfile(f16_model_path_base):
-            print(f'The file %s was not found' % f16_model_path_base)
+            print(f"The file %s was not found" % f16_model_path_base)
             sys.exit(1)
 
         f16_model_parts_paths = map(
             lambda filename: os.path.join(f16_model_path_base, filename),
-            glob.glob(f"{f16_model_path_base}*")
+            glob.glob(f"{f16_model_path_base}*"),
         )
 
         for f16_model_part_path in f16_model_parts_paths:
@@ -93,9 +102,7 @@ def main():
                 )
                 sys.exit(1)
 
-            __run_quantize_script(
-                args.quantize_script_path, f16_model_part_path
-            )
+            __run_quantize_script(args.quantize_script_path, f16_model_part_path)
 
             if args.remove_f16:
                 os.remove(f16_model_part_path)
@@ -104,6 +111,7 @@ def main():
 # This was extracted to a top-level function for parallelization, if
 # implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
 
+
 def __run_quantize_script(script_path, f16_model_part_path):
     """Run the quantize script specifying the path to it and the path to the
     f16 model to quantize.
@@ -111,8 +119,7 @@ def __run_quantize_script(script_path, f16_model_part_path):
     """
 
     new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
 
     subprocess.run(
-        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
-        check=True
+        [script_path, f16_model_part_path, new_quantized_model_path, "2"], check=True
     )
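The changes to the Python scripts above appear consistent with black's default style (string normalization to double quotes, trailing commas, line splitting, two blank lines before top-level definitions); the flake.lock hunk is separate, since black does not process JSON. A minimal sketch of how the result could be checked, assuming black is installed and the script is run from the repository root with the file names taken from the diff headers above:

# Sketch, not part of the patch: report whether the converted scripts are
# black-clean. Assumes "pip install black" and the repository root as cwd.
import subprocess

FILES = [
    "convert-ggml-to-pth.py",
    "convert-gpt4all-to-ggml.py",
    "convert-gptq-to-ggml.py",
    "convert-unversioned-ggml-to-ggml.py",
    "quantize.py",
]

# "--check --diff" makes black print what it would change without rewriting
# any file; exit code 0 means every file already matches black's formatting.
result = subprocess.run(["black", "--check", "--diff", *FILES])
print("already formatted" if result.returncode == 0 else "reformatting needed")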