Apply formatting to python code #611

Status: Closed. Wants to merge 7 commits.
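
The diff below is a pure re-formatting pass over three conversion scripts: long calls are wrapped with trailing commas, string quotes are normalized to double quotes, and two blank lines are inserted between top-level definitions. The PR does not name the tool, so treating it as black is an assumption; a minimal sketch of reproducing the same kind of pass:

# Hypothetical reproduction of this PR's formatting pass (assumes black; the
# PR itself does not say which formatter was used).
import subprocess

FILES = [
    "convert-ggml-to-pth.py",
    "convert-gpt4all-to-ggml.py",
    "convert-gptq-to-ggml.py",
]

for path in FILES:
    # `black <file>` rewrites the file in place; add "--check" to only report diffs.
    subprocess.run(["black", path], check=True)
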
52 changes: 41 additions & 11 deletions convert-ggml-to-pth.py
@@ -72,7 +72,12 @@ def dequantize_weights(fin, n_rows, n_cols):

def read_variables(fin):
model = {}
pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
pbar = tqdm(
total=os.path.getsize(fin.name),
unit="B",
unit_scale=True,
desc="Reading variables",
)
while True:
start_pos = fin.tell()
try:
@@ -98,7 +103,9 @@ def read_variables(fin):
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)

model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
model[name] = torch.tensor(
data, dtype=torch.float32 if dtype == np.float32 else torch.float16
)

pbar.update(fin.tell() - start_pos)

@@ -112,11 +119,17 @@ def convert_to_hf_format(model, hparams):
dim = hparams["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
inv_freq = 1.0 / (
base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
)

# permute for sliced rotary
def permute(w):
return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
return (
w.view(n_heads, dim // n_heads // 2, 2, dim)
.transpose(1, 2)
.reshape(dim, dim)
)

state_dict = {}
for layer_i in range(n_layers):
@@ -164,16 +177,22 @@ def permute(w):


def chat(model, hparams, llama_dir):
from transformers import (GenerationConfig, LlamaForCausalLM,
LlamaTokenizer, StoppingCriteria,
StoppingCriteriaList)
from transformers import (
GenerationConfig,
LlamaForCausalLM,
LlamaTokenizer,
StoppingCriteria,
StoppingCriteriaList,
)
from transformers.models.llama.configuration_llama import LlamaConfig

class StoppingCriteriaSub(StoppingCriteria):
def __init__(self):
super().__init__()

def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]
):
print(tokenizer.decode(input_ids[0]), end="", flush=True)
if input_ids[0][-1] == 13:
return True
@@ -237,7 +256,11 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
"--input_dir",
"-i",
type=str,
required=True,
help="The input directory containing the ggml files.",
)
parser.add_argument(
"--prefix",
@@ -252,14 +275,21 @@ def main():
help="Whether to save the model in the huggingface format. (default: False)",
)
parser.add_argument(
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
"--chat",
"-c",
action="store_true",
help="Whether to open a chat with the model. (default: False)",
)
args = parser.parse_args()

llama_dir = os.path.abspath(f"{args.input_dir}/../")

ggml_files = sorted(
[f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
[
f"{args.input_dir}/{f}"
for f in os.listdir(args.input_dir)
if f.startswith(args.prefix)
]
)

fin = open(ggml_files[0], "rb")
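
As context for the "permute for sliced rotary" helper reformatted above: it is a pure row permutation of the attention projection matrices, reordering each head's rows into the layout Hugging Face's sliced rotary embedding expects. A toy check, not part of the diff and using made-up small dimensions, that shows no values change, only row order:

# Toy illustration of the sliced-rotary permute used in convert_to_hf_format.
# Sizes are hypothetical (2 heads, dim 8), chosen only to keep the tensor small.
import torch

n_heads, dim = 2, 8

def permute(w):
    # Same expression as in the script above.
    return (
        w.view(n_heads, dim // n_heads // 2, 2, dim)
        .transpose(1, 2)
        .reshape(dim, dim)
    )

w = torch.arange(dim * dim, dtype=torch.float32).reshape(dim, dim)
p = permute(w)

# Every original row is still present, just in a different order
# (here: rows 0, 2, 1, 3 within each head's block of rows).
assert sorted(p[:, 0].tolist()) == sorted(w[:, 0].tolist())
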
29 changes: 20 additions & 9 deletions convert-gpt4all-to-ggml.py
@@ -15,37 +15,43 @@

HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]


def parse_args():
parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
parser = argparse.ArgumentParser(
description="Upgrade a GPT4All model to the current format"
)
parser.add_argument("gpt4all_model", help="path to gpt4all-lora-quantized.bin")
parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
return parser.parse_args()


def read_header(f_in):
struct_fmt = "i" * (3 + len(HPARAMS))
struct_size = struct.calcsize(struct_fmt)
buf = f_in.read(struct_size)
return struct.unpack(struct_fmt, buf)


def write_header(f_out, header):
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header

if magic != 0x67676d6c:
raise Exception('Invalid file magic. Must be an old style ggml file.')
if magic != 0x67676D6C:
raise Exception("Invalid file magic. Must be an old style ggml file.")

values = [
0x67676d66, # magic: ggml in hex
1, # file version
0x67676D66, # magic: ggml in hex
1, # file version
vocab_size,
dim,
multiple_of,
n_heads,
n_layers,
rot,
ftype
ftype,
]
f_out.write(struct.pack("i" * len(values), *values))


def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
@@ -71,22 +77,25 @@ def write_tokens(fout, tokenizer):
fout.write(text)
fout.write(struct.pack("f", 0.0))


def read_tokens(f_in, tokenizer):
for i in range(tokenizer.vocab_size()):
len_b = f_in.read(4)
(length,) = struct.unpack("i", len_b)
f_in.read(length)


def copy_all_data(f_out, f_in):
while True:
buf = f_in.read(1024 * 1024)
if not buf:
break
f_out.write(buf)


def convert_one_file(path_in, tokenizer):
path_tmp = f"{path_in}.tmp"
path_orig= f"{path_in}.orig"
path_orig = f"{path_in}.orig"
print(f"converting {path_in}")
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
write_header(f_out, read_header(f_in))
@@ -96,12 +105,14 @@ def convert_one_file(path_in, tokenizer):
os.rename(path_in, path_orig)
os.rename(path_tmp, path_in)


def main():
args = parse_args()

tokenizer = SentencePieceProcessor(args.tokenizer_model)

convert_one_file(args.gpt4all_model, tokenizer)


if __name__ == "__main__":
main()
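
The GPT4All upgrade above only rewrites the container header (the old unversioned 0x67676d6c "ggml" magic becomes the versioned 0x67676d66 "ggmf" magic plus a file-version field), re-emits the token table, and streams the tensor data through unchanged via copy_all_data. A small hypothetical helper, reusing the same magic values as write_header, for checking which format a given file is in:

# Hypothetical helper (not part of the PR): report which container format a
# file uses, based on the magic values handled by write_header above.
import struct

OLD_MAGIC = 0x67676D6C  # "ggml" in hex: old, unversioned format
NEW_MAGIC = 0x67676D66  # "ggmf" in hex: versioned format written by this script

def sniff_format(path):
    with open(path, "rb") as f:
        (magic,) = struct.unpack("i", f.read(4))
        if magic == OLD_MAGIC:
            return "old-style ggml (convert with this script)"
        if magic == NEW_MAGIC:
            (version,) = struct.unpack("i", f.read(4))
            return f"ggmf, file version {version}"
        return "unknown format"
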
69 changes: 48 additions & 21 deletions convert-gptq-to-ggml.py
@@ -20,9 +20,12 @@

model = torch.load(fname_model, map_location="cpu")

n_vocab, n_embd = model['model.embed_tokens.weight'].shape
n_layer = 1 + max(int(m.group(1)) for name in model
if (m := re.match(r'model\.layers\.([0-9]+)', name)))
n_vocab, n_embd = model["model.embed_tokens.weight"].shape
n_layer = 1 + max(
int(m.group(1))
for name in model
if (m := re.match(r"model\.layers\.([0-9]+)", name))
)

# hardcoded:
n_mult = 256
@@ -36,14 +39,14 @@

fout = open(fname_out, "wb")

fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", 0x67676D66)) # magic: ggmf in hex
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", n_vocab))
fout.write(struct.pack("i", n_embd))
fout.write(struct.pack("i", n_mult))
fout.write(struct.pack("i", n_head))
fout.write(struct.pack("i", n_layer))
fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
fout.write(struct.pack("i", 4))


@@ -66,16 +69,23 @@
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))


def write_header(shape, dst_name, ftype_cur):
sname = dst_name.encode('utf-8')
sname = dst_name.encode("utf-8")
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
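
write_header above emits, for each tensor, a fixed struct of (n_dims, name_length, ftype), then the dimensions in reverse order, then the UTF-8 name. A hypothetical reader for that per-tensor record, shown only to make the layout explicit:

# Hypothetical counterpart to write_header above; reads one per-tensor record.
import struct

def read_tensor_header(fin):
    n_dims, name_len, ftype = struct.unpack("iii", fin.read(12))
    # Dimensions were written reversed, so flip them back into tensor order.
    shape = struct.unpack("i" * n_dims, fin.read(4 * n_dims))[::-1]
    name = fin.read(name_len).decode("utf-8")
    return name, shape, ftype
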


def convert_non_q4(src_name, dst_name):
v = model[src_name]
shape = v.shape
print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
print(
"Processing non-Q4 variable: " + src_name + " with shape: ",
shape,
" and type: ",
v.dtype,
)
if len(shape) == 1:
print(" Converting to float32")
v = v.to(torch.float32)
@@ -88,11 +98,12 @@ def convert_non_q4(src_name, dst_name):
# data
v.numpy().tofile(fout)


def convert_q4(src_name, dst_name, permute=False):
zeros = model[f"{src_name}.zeros"].numpy()
scales = model[f"{src_name}.scales"].numpy()
bias = model[f"{src_name}.bias"].numpy()
qweight = model[f"{src_name}.qweight"].numpy().T # transpose
qweight = model[f"{src_name}.qweight"].numpy().T # transpose

# Q4_1 does not support bias; good thing the bias is always all zeros.
assert not np.any(bias)
@@ -113,7 +124,7 @@ def convert_q4(src_name, dst_name, permute=False):
# the columns in a row, so we end up wasting quite a bit of memory with
# repeated scales and addends.

addends = -zeros # flip sign
addends = -zeros # flip sign

# Since the output format is mixed between integers and floats, we have
# to hackily view the floats as int32s just so numpy will let us
@@ -128,37 +139,53 @@ def convert_q4(src_name, dst_name, permute=False):
addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)

blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting="no")

if permute:
# Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
# This can be done after the above conversion because it doesn't affect column order/layout.
blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
.swapaxes(1, 2)
.reshape(blob.shape))
blob = (
blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
.swapaxes(1, 2)
.reshape(blob.shape)
)

# header
write_header(shape, dst_name, 3) # ftype = Q4_1
write_header(shape, dst_name, 3) # ftype = Q4_1

# data
blob.tofile(fout)
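
The comments in convert_q4 note that ggml's Q4_1 format stores a scale and an addend for every group of 32 weights, so the GPTQ scales and zeros get repeated across each row. A hypothetical back-of-the-envelope helper for the resulting on-disk size of one tensor under that layout (one 4-byte scale, one 4-byte addend and 16 packed bytes per 32-weight group, which is my assumption about the Q4_1 block size rather than something stated in this diff):

# Hypothetical size estimate for one tensor in the Q4_1 layout written above.
QK = 32  # Q4_1 group size

def q4_1_nbytes(n_rows, n_cols):
    groups_per_row = n_cols // QK
    return n_rows * groups_per_row * (4 + 4 + QK // 2)

# Example: a 4096 x 4096 projection weight
print(q4_1_nbytes(4096, 4096))  # 12,582,912 bytes, i.e. ~12 MiB
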


convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
convert_non_q4("model.norm.weight", "norm.weight")
convert_non_q4("lm_head.weight", "output.weight")

for i in range(n_layer):
convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
convert_q4(
f"model.layers.{i}.self_attn.q_proj",
f"layers.{i}.attention.wq.weight",
permute=True,
)
convert_q4(
f"model.layers.{i}.self_attn.k_proj",
f"layers.{i}.attention.wk.weight",
permute=True,
)
convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")

convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")

convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")

convert_non_q4(
f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight"
)
convert_non_q4(
f"model.layers.{i}.post_attention_layernorm.weight",
f"layers.{i}.ffn_norm.weight",
)


fout.close()