diff --git a/convert-ggml-to-pth.py b/convert-ggml-to-pth.py
index 20158c9ca8650..8cc67785140f3 100644
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@@ -72,7 +72,12 @@ def dequantize_weights(fin, n_rows, n_cols):
 
 def read_variables(fin):
     model = {}
-    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
+    pbar = tqdm(
+        total=os.path.getsize(fin.name),
+        unit="B",
+        unit_scale=True,
+        desc="Reading variables",
+    )
     while True:
         start_pos = fin.tell()
         try:
@@ -98,7 +103,9 @@ def read_variables(fin):
         data_size = np.prod(shape)
         data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
 
-        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
+        model[name] = torch.tensor(
+            data, dtype=torch.float32 if dtype == np.float32 else torch.float16
+        )
 
         pbar.update(fin.tell() - start_pos)
 
@@ -112,11 +119,17 @@ def convert_to_hf_format(model, hparams):
     dim = hparams["dim"]
     dims_per_head = dim // n_heads
     base = 10000.0
-    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+    inv_freq = 1.0 / (
+        base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
+    )
 
     # permute for sliced rotary
     def permute(w):
-        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+        return (
+            w.view(n_heads, dim // n_heads // 2, 2, dim)
+            .transpose(1, 2)
+            .reshape(dim, dim)
+        )
 
     state_dict = {}
     for layer_i in range(n_layers):
@@ -164,16 +177,22 @@ def permute(w):
 
 
 def chat(model, hparams, llama_dir):
-    from transformers import (GenerationConfig, LlamaForCausalLM,
-                              LlamaTokenizer, StoppingCriteria,
-                              StoppingCriteriaList)
+    from transformers import (
+        GenerationConfig,
+        LlamaForCausalLM,
+        LlamaTokenizer,
+        StoppingCriteria,
+        StoppingCriteriaList,
+    )
     from transformers.models.llama.configuration_llama import LlamaConfig
 
     class StoppingCriteriaSub(StoppingCriteria):
         def __init__(self):
             super().__init__()
 
-        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
+        def __call__(
+            self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]
+        ):
             print(tokenizer.decode(input_ids[0]), end="", flush=True)
             if input_ids[0][-1] == 13:
                 return True
@@ -237,7 +256,11 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
+        "--input_dir",
+        "-i",
+        type=str,
+        required=True,
+        help="The input directory containing the ggml files.",
     )
     parser.add_argument(
         "--prefix",
@@ -252,14 +275,21 @@ def main():
         help="Whether to save the model in the huggingface format. (default: False)",
     )
     parser.add_argument(
-        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
+        "--chat",
+        "-c",
+        action="store_true",
+        help="Whether to open a chat with the model. (default: False)",
     )
     args = parser.parse_args()
 
     llama_dir = os.path.abspath(f"{args.input_dir}/../")
 
     ggml_files = sorted(
-        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
+        [
+            f"{args.input_dir}/{f}"
+            for f in os.listdir(args.input_dir)
+            if f.startswith(args.prefix)
+        ]
     )
 
     fin = open(ggml_files[0], "rb")
diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
index f1d9d7aefe3e0..555ab67d015df 100644
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@@ -15,37 +15,43 @@
 
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
-    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade a GPT4All model to the current format"
+    )
+    parser.add_argument("gpt4all_model", help="path to gpt4all-lora-quantized.bin")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66, # magic: ggml in hex
-        1, # file version
+        0x67676D66,  # magic: ggml in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -71,12 +77,14 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", 0.0))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
@@ -84,9 +92,10 @@ def copy_all_data(f_out, f_in):
         if not buf:
             break
         f_out.write(buf)
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -96,6 +105,7 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
 
+
 def main():
     args = parse_args()
 
@@ -103,5 +113,6 @@ def main():
     convert_one_file(args.gpt4all_model, tokenizer)
 
+
 if __name__ == "__main__":
     main()
diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py
index 6c77808fcd186..1cc4e2fd93fa0 100644
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -20,9 +20,12 @@
 
 model = torch.load(fname_model, map_location="cpu")
 
-n_vocab, n_embd = model['model.embed_tokens.weight'].shape
-n_layer = 1 + max(int(m.group(1)) for name in model
-                  if (m := re.match(r'model\.layers\.([0-9]+)', name)))
+n_vocab, n_embd = model["model.embed_tokens.weight"].shape
+n_layer = 1 + max(
+    int(m.group(1))
+    for name in model
+    if (m := re.match(r"model\.layers\.([0-9]+)", name))
+)
 
 # hardcoded:
 n_mult = 256
@@ -36,14 +39,14 @@
 
 fout = open(fname_out, "wb")
 
-fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
-fout.write(struct.pack("i", 1)) # file version
+fout.write(struct.pack("i", 0x67676D66))  # magic: ggmf in hex
+fout.write(struct.pack("i", 1))  # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
 fout.write(struct.pack("i", n_head))
 fout.write(struct.pack("i", n_layer))
-fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
+fout.write(struct.pack("i", n_embd // n_head))  # rot (obsolete)
 fout.write(struct.pack("i", 4))
@@ -66,16 +69,23 @@
     fout.write(text)
     fout.write(struct.pack("f", tokenizer.get_score(i)))
 
+
 def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode('utf-8')
+    sname = dst_name.encode("utf-8")
     fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
     fout.write(struct.pack("i" * len(shape), *shape[::-1]))
     fout.write(sname)
 
+
 def convert_non_q4(src_name, dst_name):
     v = model[src_name]
     shape = v.shape
-    print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
+    print(
+        "Processing non-Q4 variable: " + src_name + " with shape: ",
+        shape,
+        " and type: ",
+        v.dtype,
+    )
     if len(shape) == 1:
         print(" Converting to float32")
         v = v.to(torch.float32)
@@ -88,11 +98,12 @@
     # data
     v.numpy().tofile(fout)
 
+
 def convert_q4(src_name, dst_name, permute=False):
     zeros = model[f"{src_name}.zeros"].numpy()
     scales = model[f"{src_name}.scales"].numpy()
     bias = model[f"{src_name}.bias"].numpy()
-    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
+    qweight = model[f"{src_name}.qweight"].numpy().T  # transpose
 
     # Q4_1 does not support bias; good thing the bias is always all zeros.
     assert not np.any(bias)
@@ -113,7 +124,7 @@
     # the columns in a row, so we end up wasting quite a bit of memory with
     # repeated scales and addends.
 
-    addends = -zeros # flip sign
+    addends = -zeros  # flip sign
 
     # Since the output format is mixed between integers and floats, we have
     # to hackily view the floats as int32s just so numpy will let us
@@ -128,37 +139,53 @@
     addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
     scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
 
-    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
+    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting="no")
 
     if permute:
         # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
         # This can be done after the above conversion because it doesn't affect column order/layout.
-        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(blob.shape))
+        blob = (
+            blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(blob.shape)
+        )
 
     # header
-    write_header(shape, dst_name, 3) # ftype = Q4_1
+    write_header(shape, dst_name, 3)  # ftype = Q4_1
 
     # data
     blob.tofile(fout)
 
+
 convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
 convert_non_q4("model.norm.weight", "norm.weight")
 convert_non_q4("lm_head.weight", "output.weight")
 
 for i in range(n_layer):
-    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
+    convert_q4(
+        f"model.layers.{i}.self_attn.q_proj",
+        f"layers.{i}.attention.wq.weight",
+        permute=True,
+    )
+    convert_q4(
+        f"model.layers.{i}.self_attn.k_proj",
+        f"layers.{i}.attention.wk.weight",
+        permute=True,
+    )
     convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
     convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
     convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
    convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
-    convert_q4(f"model.layers.{i}.mlp.up_proj",   f"layers.{i}.feed_forward.w3.weight")
-
-    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
-    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
+    convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
+
+    convert_non_q4(
+        f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight"
+    )
+    convert_non_q4(
+        f"model.layers.{i}.post_attention_layernorm.weight",
+        f"layers.{i}.ffn_norm.weight",
+    )
 
 fout.close()
diff --git a/convert-unversioned-ggml-to-ggml.py b/convert-unversioned-ggml-to-ggml.py
index 33b6243bd94e0..1c1b133746d18 100644
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -10,37 +10,43 @@
 
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
-    parser.add_argument('dir_model', help='directory containing ggml .bin files')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade old ggml model files to the current format"
+    )
+    parser.add_argument("dir_model", help="directory containing ggml .bin files")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66, # magic: ggml in hex
-        1, # file version
+        0x67676D66,  # magic: ggml in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -60,12 +66,14 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
@@ -73,9 +81,10 @@ def copy_all_data(f_out, f_in):
         if not buf:
             break
         f_out.write(buf)
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -85,6 +94,7 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
 
+
 def main():
     args = parse_args()
     files = []
@@ -96,5 +106,6 @@ def main():
     for file in files:
         convert_one_file(file, tokenizer)
 
+
 if __name__ == "__main__":
     main()
diff --git a/flake.lock b/flake.lock
index 343996da126e9..f56aab2c995b4 100644
--- a/flake.lock
+++ b/flake.lock
@@ -1,43 +1,34 @@
 {
-  "nodes": {
-    "flake-utils": {
-      "locked": {
-        "lastModified": 1676283394,
-        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
-        "type": "github"
-      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
+    "nodes": {
+        "flake-utils": {
+            "locked": {
+                "lastModified": 1676283394,
+                "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
+                "owner": "numtide",
+                "repo": "flake-utils",
+                "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
+                "type": "github",
+            },
+            "original": {"owner": "numtide", "repo": "flake-utils", "type": "github"},
+        },
+        "nixpkgs": {
+            "locked": {
+                "lastModified": 1678470307,
+                "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
+                "owner": "NixOS",
+                "repo": "nixpkgs",
+                "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
+                "type": "github",
+            },
+            "original": {
+                "owner": "NixOS",
+                "ref": "nixos-unstable",
+                "repo": "nixpkgs",
+                "type": "github",
+            },
+        },
+        "root": {"inputs": {"flake-utils": "flake-utils", "nixpkgs": "nixpkgs"}},
     },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1678470307,
-        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "root": {
-      "inputs": {
-        "flake-utils": "flake-utils",
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
+    "root": "root",
+    "version": 7,
 }
diff --git a/quantize.py b/quantize.py
index 641df8dda1b1e..b01307dbd192e 100644
--- a/quantize.py
+++ b/quantize.py
@@ -25,27 +25,36 @@ def main():
     quantize_script_binary = "quantize"
 
     parser = argparse.ArgumentParser(
-        prog='python3 quantize.py',
-        description='This script quantizes the given models by applying the '
-        f'"{quantize_script_binary}" script on them.'
+        prog="python3 quantize.py",
+        description="This script quantizes the given models by applying the "
+        f'"{quantize_script_binary}" script on them.',
    )
     parser.add_argument(
-        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
-        help='The models to quantize.'
+        "models",
+        nargs="+",
+        choices=("7B", "13B", "30B", "65B"),
+        help="The models to quantize.",
     )
     parser.add_argument(
-        '-r', '--remove-16', action='store_true', dest='remove_f16',
-        help='Remove the f16 model after quantizing it.'
+        "-r",
+        "--remove-16",
+        action="store_true",
+        dest="remove_f16",
+        help="Remove the f16 model after quantizing it.",
     )
     parser.add_argument(
-        '-m', '--models-path', dest='models_path',
+        "-m",
+        "--models-path",
+        dest="models_path",
         default=os.path.join(os.getcwd(), "models"),
-        help='Specify the directory where the models are located.'
+        help="Specify the directory where the models are located.",
     )
     parser.add_argument(
-        '-q', '--quantize-script-path', dest='quantize_script_path',
+        "-q",
+        "--quantize-script-path",
+        dest="quantize_script_path",
         default=os.path.join(os.getcwd(), quantize_script_binary),
-        help='Specify the path to the "quantize" script.'
+        help='Specify the path to the "quantize" script.',
     )
 
     # TODO: Revise this code
@@ -75,12 +84,12 @@ def main():
         )
 
         if not os.path.isfile(f16_model_path_base):
-            print(f'The file %s was not found' % f16_model_path_base)
+            print(f"The file %s was not found" % f16_model_path_base)
             sys.exit(1)
 
         f16_model_parts_paths = map(
             lambda filename: os.path.join(f16_model_path_base, filename),
-            glob.glob(f"{f16_model_path_base}*")
+            glob.glob(f"{f16_model_path_base}*"),
         )
 
         for f16_model_part_path in f16_model_parts_paths:
@@ -93,9 +102,7 @@ def main():
                 )
                 sys.exit(1)
 
-            __run_quantize_script(
-                args.quantize_script_path, f16_model_part_path
-            )
+            __run_quantize_script(args.quantize_script_path, f16_model_part_path)
 
             if args.remove_f16:
                 os.remove(f16_model_part_path)
@@ -104,6 +111,7 @@ def main():
 # This was extracted to a top-level function for parallelization, if
 # implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
 
+
 def __run_quantize_script(script_path, f16_model_part_path):
     """Run the quantize script specifying the path to it and the path to the
     f16 model to quantize.
@@ -111,8 +119,7 @@ def __run_quantize_script(script_path, f16_model_part_path):
     """
 
     new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
 
     subprocess.run(
-        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
-        check=True
+        [script_path, f16_model_part_path, new_quantized_model_path, "2"], check=True
     )
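The changes to the Python scripts above appear consistent with black's default style (string normalization to double quotes, trailing commas, line splitting, two blank lines before top-level definitions); the flake.lock hunk is separate, since black does not process JSON. A minimal sketch of how the result could be checked, assuming black is installed and the script is run from the repository root with the file names taken from the diff headers above:

# Sketch, not part of the patch: report whether the converted scripts are
# black-clean. Assumes "pip install black" and the repository root as cwd.
import subprocess

FILES = [
    "convert-ggml-to-pth.py",
    "convert-gpt4all-to-ggml.py",
    "convert-gptq-to-ggml.py",
    "convert-unversioned-ggml-to-ggml.py",
    "quantize.py",
]

# "--check --diff" makes black print what it would change without rewriting
# any file; exit code 0 means every file already matches black's formatting.
result = subprocess.run(["black", "--check", "--diff", *FILES])
print("already formatted" if result.returncode == 0 else "reformatting needed")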