Benchmark2 #2

Merged: 22 commits, Dec 14, 2023
Changes from all commits
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,3 +1,7 @@
llama-2-7b/
tokenizer.model
tokenizer_checklist.chk

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
123 changes: 87 additions & 36 deletions inference_benchmark.py
@@ -1,16 +1,18 @@
import torch
from torch.utils.data import DataLoader
import time
import os
from datasets import load_dataset
import fire
from torch.profiler import profile, record_function, ProfilerActivity

### Setup ###
BATCH_SIZE = 1
BATCH_COUNT = 5
NUM_WORKERS = 1
PROFILE_MEMORY = True

# https://huggingface.co/datasets/gsm8k
HUGGING_FACE_GSMK_DATASET_ID = "gsm8k"

# Manual seed for reproducibility
SEED = 42
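Each record in GSM8K's 'main' config is a plain dict with a question and a worked answer. A quick peek at the record shape (a sketch using the standard datasets API, not part of this script):

# Illustration only: what one GSM8K training record looks like.
from datasets import load_dataset
sample = load_dataset("gsm8k", "main")["train"][0]
print(sample["question"])  # e.g. "Natalia sold clips to 48 of her friends..."
print(sample["answer"])    # step-by-step solution ending in "#### <number>"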
@@ -20,45 +22,55 @@
DEVICE_CUDA = 'cuda'
DEVICE_CPU = 'cpu'

from llama import Llama
from typing import List

def get_device():
    return torch.device(DEVICE_CUDA if torch.cuda.is_available() else DEVICE_CPU)

def get_data_loader(num_workers=1):
    dataset = load_dataset(HUGGING_FACE_GSMK_DATASET_ID, 'main')['train']
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=num_workers
    )
    return dataloader
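With BATCH_SIZE = 1 and PyTorch's default collate function, each batch from this loader comes out as a dict of lists rather than a (question, answer) tuple, which matters for the unpacking in run_batch_inference below. A quick illustration (assuming standard DataLoader behavior over dict-shaped records):

# Illustration only: the collated batch structure.
batch = next(iter(get_data_loader()))
print(type(batch))        # <class 'dict'>
print(batch["question"])  # list containing one question string
print(batch["answer"])    # list containing one worked answer string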


def get_model(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size):
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )
    return generator
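In the reference llama repo, Llama.build initializes torch.distributed and loads the checkpoint shards onto the GPU, so the process is expected to be launched with torchrun. A hypothetical smoke test (the paths are assumptions that match the new .gitignore entries, not part of this PR):

# Hypothetical smoke test; paths are assumptions, not part of this PR.
generator = get_model(
    ckpt_dir="llama-2-7b/",
    tokenizer_path="tokenizer.model",
    max_seq_len=128,
    max_batch_size=1,
)
print(generator.text_completion(["The capital of France is"], max_gen_len=8))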


def run_benchmark(dataloader, model):
    load_time_per_batch = torch.zeros(BATCH_COUNT)
    inference_time_per_batch = torch.zeros(BATCH_COUNT)
    total_time_per_batch = torch.zeros(BATCH_COUNT)

    device = get_device()
    # model.to(device) is not needed here: Llama.build already places the weights.
    print("Working on device: {}".format(device))

    for batch_idx in range(BATCH_COUNT):
        print("Starting BATCH {} of {}".format(batch_idx + 1, BATCH_COUNT))
        (output, load_time, inference_time), batch_time = measure_runtime(
            run_batch_inference,
            dataloader,
            model)
        load_time_per_batch[batch_idx] = load_time
        inference_time_per_batch[batch_idx] = inference_time
        total_time_per_batch[batch_idx] = batch_time

        print("Finished Batch {} of {}".format(batch_idx + 1, BATCH_COUNT))
        print("Batch load time: {}".format(load_time))
        print("Batch inference time: {}".format(inference_time))
        print("Batch total time: {}".format(batch_time))
    return model, load_time_per_batch, inference_time_per_batch, total_time_per_batch

@@ -71,46 +83,85 @@ def measure_runtime(func, *func_args):
    return result, elapsed
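The body of measure_runtime is collapsed in this diff view; a minimal implementation consistent with its call sites (a sketch, not necessarily the author's exact code):

def measure_runtime(func, *func_args):
    # Run func and return its result together with elapsed wall-clock seconds.
    # For GPU work, a torch.cuda.synchronize() before each clock reading would
    # make the timings more faithful to the actual device activity.
    start = time.perf_counter()
    result = func(*func_args)
    elapsed = time.perf_counter() - start
    return result, elapsed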


def run_batch_inference(dataloader, model):
    batch, load_time = measure_runtime(
        __get_next_batch, dataloader)

    # The default collate function delivers the batch as a dict of lists
    # ({'question': [...], 'answer': [...]}), so index by key rather than
    # tuple-unpacking the dict, which would only yield the key names.
    question = batch["question"]
    answer = batch["answer"]  # unused for now; handy for a later accuracy check

    output, inference_time = measure_runtime(
        inference,
        model,
        question)

    return output, load_time, inference_time

def inference(
    generator: Llama,
    prompts: List[str],
    temperature: float = 0.6,
    top_p: float = 0.9,
    max_gen_len: int = 64,
):
    # no_grad keeps autograd bookkeeping out of the timed region.
    with torch.no_grad():
        results = generator.text_completion(
            prompts,
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
        )
    return zip(prompts, results)
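inference returns a lazy, single-use zip of (prompt, completion) pairs; in the reference llama repo each completion is a dict carrying a 'generation' field (an assumption worth verifying against the pinned version). Consuming the output would look like:

# Illustration only: iterating the (prompt, completion) pairs.
for prompt, completion in inference(generator, ["2 + 2 ="]):
    print(prompt, "->", completion["generation"])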

def __get_next_batch(dataloader):
    # Note: building a fresh iterator on every call means this always returns
    # the *first* batch; keeping one iterator alive would vary the data.
    return next(iter(dataloader))


def benchmark(ckpt_dir,
              tokenizer_path,
              max_seq_len,
              max_batch_size):
    print("Starting up...")

    print("Building data loaders...")
    data_loader = get_data_loader()

    print("Initializing Model...")
    net = get_model(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)

    print("Running inference benchmark...\n")

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 record_shapes=True, profile_memory=PROFILE_MEMORY) as prof:
        # record_function("run_benchmark") could wrap this call to give it a
        # named span in the profiler output.
        _, load, inference, total = run_benchmark(data_loader, net)

print("\n\n Manual Profile Results...")
print("Data-loading times")
print("> per epoch: ", load)
print("> average: ", torch.mean(load))
print("C2.2: Training time for each epoch")
print("\nInference time for each epoch")
print("> per epoch", inference)
print("> average", torch.mean(inference))
print("C2.3: Total time for each epoch")
print("\nTotal time for each epoch")
print("> per epoch", total)
print("> average", torch.mean(total))

print("\n\n")
print("Profiling sorted by CUDA time total")
profile_cuda_time = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
print(profile_cuda_time)

print("\n\n")
print("Profiling sorted by CUDA memory usage")
profile_cuda_mem = prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10)
print(profile_cuda_mem)
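Beyond the printed tables, the same profiler object can dump a timeline for chrome://tracing, which is often easier to read (optional, not part of this PR):

# Optional: export a timeline after the profiling block exits.
prof.export_chrome_trace("inference_benchmark_trace.json")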


if __name__ == "__main__":
benchmark()
fire.Fire(benchmark)
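Since Llama.build expects a distributed launcher, a typical invocation routes the CLI flags through Fire onto benchmark's parameters; for example (values are illustrative, and the paths are assumptions matching the .gitignore entries above):

torchrun --nproc_per_node 1 inference_benchmark.py \
    --ckpt_dir llama-2-7b/ \
    --tokenizer_path tokenizer.model \
    --max_seq_len 512 \
    --max_batch_size 1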