
Commit b988f0e

Merge pull request #2 from gtamer2/benchmark2
Benchmark2
2 parents 64ab5a8 + d7da6e1 commit b988f0e

File tree

2 files changed: +91 -36 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions

@@ -1,3 +1,7 @@
+llama-2-7b/
+tokenizer.model
+tokenizer_checklist.chk
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
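
Note (editorial, not part of the commit): the new ignore rules keep the Llama 2 weights and tokenizer out of version control and imply they live at the repository root. A minimal sketch to check they are in place before benchmarking; the paths are assumptions taken from the ignore rules above, not from the repo's docs:

    from pathlib import Path

    # Paths assumed from the .gitignore entries; adjust if stored elsewhere.
    for artifact in ("llama-2-7b", "tokenizer.model", "tokenizer_checklist.chk"):
        if not Path(artifact).exists():
            raise FileNotFoundError(f"{artifact} missing; download the Llama 2 weights first")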

inference_benchmark.py

Lines changed: 87 additions & 36 deletions
@@ -1,16 +1,18 @@
 import torch
 from torch.utils.data import DataLoader
 import time
-import cotracker.models.build_cotracker
-from cotracker.datasets.tap_vid_datasets import TapVidDataset
-import os
-from cotracker.datasets.utils import collate_fn
 from datasets import load_dataset
+import fire
+from torch.profiler import profile, record_function, ProfilerActivity

 ### Setup ###
 BATCH_SIZE = 1
 BATCH_COUNT = 5
 NUM_WORKERS = 1
+PROFILE_MEMORY = True
+
+# https://huggingface.co/datasets/gsm8k
+HUGGING_FACE_GSMK_DATASET_ID = "gsm8k"

 # Manual seed for reproducibility
 SEED = 42
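
Note (editorial): the dataset id "gsm8k" with the "main" config resolves on the Hugging Face Hub to grade-school math word problems. A quick sketch, not part of the commit, of what one record looks like; the question/answer field names come from the dataset card:

    from datasets import load_dataset

    ds = load_dataset("gsm8k", "main")["train"]
    sample = ds[0]
    print(sample["question"])  # a grade-school math word problem
    print(sample["answer"])    # worked solution ending in "#### <number>"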
@@ -20,45 +22,55 @@
 DEVICE_CUDA = 'cuda'
 DEVICE_CPU = 'cpu'

+from llama import Llama
+from typing import List
+
+def get_device():
+    return torch.device(DEVICE_CUDA if torch.cuda.is_available() else DEVICE_CPU)

 def get_data_loader(num_workers=1):
-    dataset = load_dataset("HuggingFaceH4/no_robots")
+    dataset = load_dataset(HUGGING_FACE_GSMK_DATASET_ID, 'main')['train']
     dataloader = DataLoader(
         dataset,
         batch_size=BATCH_SIZE,
         shuffle=False,
-        num_workers=num_workers,
-        collate_fn=collate_fn,
+        num_workers=num_workers
     )
     return dataloader


-def get_model(checkpoint_path=CHECKPOINT_S4_W12):
-    return cotracker.models.build_cotracker.build_cotracker(checkpoint_path)
+def get_model(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size):
+    generator = Llama.build(
+        ckpt_dir=ckpt_dir,
+        tokenizer_path=tokenizer_path,
+        max_seq_len=max_seq_len,
+        max_batch_size=max_batch_size,
+    )
+    return generator


-def run_inference(dataloader, model, cuda=True):
+def run_benchmark(dataloader, model):
     load_time_per_batch = torch.zeros(BATCH_COUNT)
     inference_time_per_batch = torch.zeros(BATCH_COUNT)
     total_time_per_batch = torch.zeros(BATCH_COUNT)

-    device = DEVICE_CUDA if cuda else DEVICE_CPU
+    device = get_device()
+    # model.to(device)
     print("Working on device: {}".format(device))
-    model.to(device)
+

     for batch_idx in range(BATCH_COUNT):
-        print("Starting BATCHs {} of {}".format(batch_idx + 1, BATCH_COUNT))
-        (output, load_time, train_time), batch_time = measure_runtime(run_batch_inference,
+        print("Starting BATCH {} of {}".format(batch_idx + 1, BATCH_COUNT))
+        (output, load_time, inference_time), batch_time = measure_runtime(run_batch_inference,
                                                                       dataloader,
-                                                                      model,
-                                                                      cuda)
+                                                                      model)
         load_time_per_batch[batch_idx] = load_time
-        inference_time_per_batch[batch_idx] = train_time
+        inference_time_per_batch[batch_idx] = inference_time
         total_time_per_batch[batch_idx] = batch_time

         print("Finished Batch {} of {}".format(batch_idx + 1, BATCH_COUNT))
         print("Batch load time: {}".format(load_time))
-        print("Batch inference time: {}".format(train_time))
+        print("Batch inference time: {}".format(inference_time))
         print("Batch total time: {}".format(batch_time))
     return model, load_time_per_batch, inference_time_per_batch, total_time_per_batch
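Note (editorial): measure_runtime itself is untouched by this commit, so only its last line appears in the next hunk. Judging from its call sites, (result, elapsed) = measure_runtime(func, *args), it presumably resembles the sketch below; this is an assumption, not the repository's code, and the torch.cuda.synchronize() call is an editorial addition, since CUDA kernels launch asynchronously and wall-clock timings around GPU work are unreliable without it:

    import time
    import torch

    def measure_runtime(func, *func_args):
        # Time a single call; return the callable's result along with elapsed seconds.
        start = time.perf_counter()
        result = func(*func_args)
        if torch.cuda.is_available():
            torch.cuda.synchronize()  # flush queued GPU work before reading the clock
        elapsed = time.perf_counter() - start
        return result, elapsed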

@@ -71,46 +83,85 @@ def measure_runtime(func, *func_args):
     return result, elapsed


-def run_batch_inference(dataloader, model, cuda=True):
-    (x, y), load_time = measure_runtime(
+def run_batch_inference(dataloader, model):
+    (question, answer), load_time = measure_runtime(
         __get_next_batch, dataloader)

-    if cuda:
-        x = x.to(DEVICE_CUDA)
-        y = y.to(DEVICE_CUDA)
-
-    output, train_time = measure_runtime(
+
+    # print("question: ", question, "\nanswer: ", answer)
+    # print("question type: ", type(question), "answer type", type(answer))
+    # print("question shape: ", len(question), "answer shape", len(answer))
+    # device = get_device()
+    # x = x.to(device)
+    # y = y.to(device)
+
+    output, inference_time = measure_runtime(
+        inference,
         model,
-        x)
+        [question])

-    return output, load_time, train_time
+    return output, load_time, inference_time
+
+def inference(
+    generator: Llama,
+    prompts: List[str],
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    max_gen_len: int = 64,
+):
+    with torch.no_grad():
+        results = generator.text_completion(
+            prompts,
+            max_gen_len=max_gen_len,
+            temperature=temperature,
+            top_p=top_p,
+        )
+    return zip(prompts, results)

 def __get_next_batch(dataloader):
     return next(iter(dataloader))


-def benchmark():
+def benchmark(ckpt_dir,
+              tokenizer_path,
+              max_seq_len,
+              max_batch_size):
     print("Starting up...")

     print("Building data loaders...")
     data_loader = get_data_loader()

     print("Initializing Model...")
-    net = get_model()
+    net = get_model(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)

     print("Running inference benchmark...\n")
-    _, load, inference, total = run_batch_inference(data_loader, net)
-
-    print("Results...")
-    print("C2.1: Data-loading times")
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=PROFILE_MEMORY) as prof:
+        # with record_function("run_benchmark"):
+        #     _, load, inference, total = run_benchmark(data_loader, net)
+        _, load, inference, total = run_benchmark(data_loader, net)
+
+    print("\n\n Manual Profile Results...")
+    print("Data-loading times")
     print("> per epoch: ", load)
     print("> average: ", torch.mean(load))
-    print("C2.2: Training time for each epoch")
+    print("\nInference time for each epoch")
     print("> per epoch", inference)
     print("> average", torch.mean(inference))
-    print("C2.3: Total time for each epoch")
+    print("\nTotal time for each epoch")
     print("> per epoch", total)
     print("> average", torch.mean(total))

+    print("\n\n")
+    print("Profiling sorted by CUDA time total")
+    profile_cuda_time = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
+    print(profile_cuda_time)
+
+    print("\n\n")
+    print("Profiling sorted by CUDA memory usage")
+    profile_cuda_mem = prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10)
+    print(profile_cuda_mem)
+
+
 if __name__ == "__main__":
-    benchmark()
+    fire.Fire(benchmark)
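
Note (editorial, not part of the commit): with fire.Fire(benchmark), each parameter becomes a CLI flag, and Llama.build expects the process-group environment that torchrun provides, so a plausible launch, with paths assumed from the .gitignore above, is: torchrun --nproc_per_node 1 inference_benchmark.py --ckpt_dir llama-2-7b/ --tokenizer_path tokenizer.model --max_seq_len 512 --max_batch_size 1. One thing the commented-out debug prints hint at: __get_next_batch returns a default-collated dict, and tuple-unpacking a dict binds its keys, so question in run_batch_inference is likely the literal string "question" rather than a prompt. In the Llama 2 reference API, text_completion returns one dict per prompt with a "generation" key, and zip() yields a one-shot iterator, so a caller might consume inference() as in this sketch:

    from llama import Llama

    # Assumes inference() as defined above and the paths implied by the .gitignore.
    generator = Llama.build(
        ckpt_dir="llama-2-7b/",
        tokenizer_path="tokenizer.model",
        max_seq_len=512,
        max_batch_size=1,
    )
    # Materialize the zip, since it can only be iterated once.
    for prompt, completion in list(inference(generator, ["What is 12 * 7?"])):
        print(prompt, "->", completion["generation"])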
