Benchmark2 #2

Merged: 22 commits, Dec 14, 2023
Changes from all commits
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,3 +1,7 @@
llama-2-7b/
tokenizer.model
tokenizer_checklist.chk

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
123 changes: 87 additions & 36 deletions inference_benchmark.py
@@ -1,16 +1,18 @@
import torch
from torch.utils.data import DataLoader
import time
import os
from datasets import load_dataset
import fire
from torch.profiler import profile, record_function, ProfilerActivity

### Setup ###
BATCH_SIZE = 1
BATCH_COUNT = 5
NUM_WORKERS = 1
PROFILE_MEMORY = True

# https://huggingface.co/datasets/gsm8k
HUGGING_FACE_GSMK_DATASET_ID = "gsm8k"

# Manual seed for reproducibility
SEED = 42
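Each record in GSM8K's 'main' config is a plain dict with a question and a worked answer. A quick peek at the record shape (a sketch using the standard datasets API, not part of this script):

# Illustration only: what one GSM8K training record looks like.
from datasets import load_dataset
sample = load_dataset("gsm8k", "main")["train"][0]
print(sample["question"])  # e.g. "Natalia sold clips to 48 of her friends..."
print(sample["answer"])    # step-by-step solution ending in "#### <number>"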
@@ -20,45 +22,55 @@
DEVICE_CUDA = 'cuda'
DEVICE_CPU = 'cpu'

from llama import Llama
from typing import List

def get_device():
    return torch.device(DEVICE_CUDA if torch.cuda.is_available() else DEVICE_CPU)

def get_data_loader(num_workers=1):
    dataset = load_dataset(HUGGING_FACE_GSMK_DATASET_ID, 'main')['train']
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=num_workers
    )
    return dataloader
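With BATCH_SIZE = 1 and PyTorch's default collate function, each batch from this loader comes out as a dict of lists rather than a (question, answer) tuple, which matters for the unpacking in run_batch_inference below. A quick illustration (assuming standard DataLoader behavior over dict-shaped records):

# Illustration only: the collated batch structure.
batch = next(iter(get_data_loader()))
print(type(batch))        # <class 'dict'>
print(batch["question"])  # list containing one question string
print(batch["answer"])    # list containing one worked answer string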


def get_model(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size):
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )
    return generator
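In the reference llama repo, Llama.build initializes torch.distributed and loads the checkpoint shards onto the GPU, so the process is expected to be launched with torchrun. A hypothetical smoke test (the paths are assumptions that match the new .gitignore entries, not part of this PR):

# Hypothetical smoke test; paths are assumptions, not part of this PR.
generator = get_model(
    ckpt_dir="llama-2-7b/",
    tokenizer_path="tokenizer.model",
    max_seq_len=128,
    max_batch_size=1,
)
print(generator.text_completion(["The capital of France is"], max_gen_len=8))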


def run_benchmark(dataloader, model):
    load_time_per_batch = torch.zeros(BATCH_COUNT)
    inference_time_per_batch = torch.zeros(BATCH_COUNT)
    total_time_per_batch = torch.zeros(BATCH_COUNT)

    device = get_device()
    # model.to(device) is not needed here: Llama.build already places the weights.
    print("Working on device: {}".format(device))

    for batch_idx in range(BATCH_COUNT):
        print("Starting BATCH {} of {}".format(batch_idx + 1, BATCH_COUNT))
        (output, load_time, inference_time), batch_time = measure_runtime(
            run_batch_inference,
            dataloader,
            model)
        load_time_per_batch[batch_idx] = load_time
        inference_time_per_batch[batch_idx] = inference_time
        total_time_per_batch[batch_idx] = batch_time

        print("Finished Batch {} of {}".format(batch_idx + 1, BATCH_COUNT))
        print("Batch load time: {}".format(load_time))
        print("Batch inference time: {}".format(inference_time))
        print("Batch total time: {}".format(batch_time))
    return model, load_time_per_batch, inference_time_per_batch, total_time_per_batch

@@ -71,46 +83,85 @@ def measure_runtime(func, *func_args):
    return result, elapsed
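The body of measure_runtime is collapsed in this diff view; a minimal implementation consistent with its call sites (a sketch, not necessarily the author's exact code):

def measure_runtime(func, *func_args):
    # Run func and return its result together with elapsed wall-clock seconds.
    # For GPU work, a torch.cuda.synchronize() before each clock reading would
    # make the timings more faithful to the actual device activity.
    start = time.perf_counter()
    result = func(*func_args)
    elapsed = time.perf_counter() - start
    return result, elapsed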


def run_batch_inference(dataloader, model):
    batch, load_time = measure_runtime(
        __get_next_batch, dataloader)

    # The default collate function delivers the batch as a dict of lists
    # ({'question': [...], 'answer': [...]}), so index by key rather than
    # tuple-unpacking the dict, which would only yield the key names.
    question = batch["question"]
    answer = batch["answer"]  # unused for now; handy for a later accuracy check

    output, inference_time = measure_runtime(
        inference,
        model,
        question)

    return output, load_time, inference_time

def inference(
    generator: Llama,
    prompts: List[str],
    temperature: float = 0.6,
    top_p: float = 0.9,
    max_gen_len: int = 64,
):
    # no_grad keeps autograd bookkeeping out of the timed region.
    with torch.no_grad():
        results = generator.text_completion(
            prompts,
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
        )
    return zip(prompts, results)
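inference returns a lazy, single-use zip of (prompt, completion) pairs; in the reference llama repo each completion is a dict carrying a 'generation' field (an assumption worth verifying against the pinned version). Consuming the output would look like:

# Illustration only: iterating the (prompt, completion) pairs.
for prompt, completion in inference(generator, ["2 + 2 ="]):
    print(prompt, "->", completion["generation"])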

def __get_next_batch(dataloader):
    # Note: building a fresh iterator on every call means this always returns
    # the *first* batch; keeping one iterator alive would vary the data.
    return next(iter(dataloader))


def benchmark(ckpt_dir,
              tokenizer_path,
              max_seq_len,
              max_batch_size):
    print("Starting up...")

    print("Building data loaders...")
    data_loader = get_data_loader()

    print("Initializing Model...")
    net = get_model(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)

    print("Running inference benchmark...\n")

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 record_shapes=True, profile_memory=PROFILE_MEMORY) as prof:
        # record_function("run_benchmark") could wrap this call to give it a
        # named span in the profiler output.
        _, load, inference, total = run_benchmark(data_loader, net)

print("\n\n Manual Profile Results...")
print("Data-loading times")
print("> per epoch: ", load)
print("> average: ", torch.mean(load))
print("C2.2: Training time for each epoch")
print("\nInference time for each epoch")
print("> per epoch", inference)
print("> average", torch.mean(inference))
print("C2.3: Total time for each epoch")
print("\nTotal time for each epoch")
print("> per epoch", total)
print("> average", torch.mean(total))

print("\n\n")
print("Profiling sorted by CUDA time total")
profile_cuda_time = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
print(profile_cuda_time)

print("\n\n")
print("Profiling sorted by CUDA memory usage")
profile_cuda_mem = prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10)
print(profile_cuda_mem)
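Beyond the printed tables, the same profiler object can dump a timeline for chrome://tracing, which is often easier to read (optional, not part of this PR):

# Optional: export a timeline after the profiling block exits.
prof.export_chrome_trace("inference_benchmark_trace.json")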


if __name__ == "__main__":
benchmark()
fire.Fire(benchmark)
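Since Llama.build expects a distributed launcher, a typical invocation routes the CLI flags through Fire onto benchmark's parameters; for example (values are illustrative, and the paths are assumptions matching the .gitignore entries above):

torchrun --nproc_per_node 1 inference_benchmark.py \
    --ckpt_dir llama-2-7b/ \
    --tokenizer_path tokenizer.model \
    --max_seq_len 512 \
    --max_batch_size 1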