# import torch_xla2 first!
import torch_xla2  # pylint: disable
import jax
+ from jax import numpy as jnp
from absl import app, flags
from jetstream.engine import token_utils
from jetstream.core import server_lib
flags.DEFINE_integer("max_output_length", 1024, "Maximum output length")
flags.DEFINE_integer("port", 9000, "port to listen on")
flags.DEFINE_integer("threads", 64, "number of worker threads in thread pool")
+ flags.DEFINE_string(
+     "benchmark_save_offline_result_to_file",
+     "",
+     "if set, then save the result to the given file name",
+ )
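+ # Example invocation (illustrative only; the actual entry point and flag values
+ # depend on how this CLI is installed):
+ #   <cli> benchmark_offline --model_id=<model_id> \
+ #       --benchmark_save_offline_result_to_file=/tmp/offline_benchmark.md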


def shard_weights(env, weights, weight_shardings):
@@ -113,6 +115,42 @@ def _check_model_id():
    list_model()
    sys.exit(1)

+ def _run_prefill_time(engine, params, decode_state, seqlen, profiler_started):
+   """Run prefill and measure time."""
+   metadata = engine.get_tokenizer()
+   tokenizer = engine.build_tokenizer(metadata)
+
+   text = "This is a beautiful day"
+   tokens, true_length = tokenizer.encode(
+       text, is_bos=True, prefill_lengths=[seqlen]
+   )
+
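+   # Warm-up: run a few untimed prefill + insert iterations first so one-time
+   # compilation/tracing cost is not counted in the measurement.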
+   for _ in range(3):
+     prefill_result, _ = engine.prefill(
+         params=params, padded_tokens=tokens, true_length=true_length
+     )
+     decode_state = engine.insert(
+         prefill_result, decode_state, slot=jnp.int32(1)
+     )
+
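+   # Timed runs: average over `nums` prefill calls. The profiler trace (if
+   # requested via FLAGS.profiling_prefill) is started on the last iteration,
+   # and block_until_ready() ensures the asynchronously dispatched device work
+   # has finished before the timer stops.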
+   nums = 5
+   start = time.perf_counter()
+   for i in range(nums):
+     if i == nums - 1 and FLAGS.profiling_prefill and not profiler_started:
+       jax.profiler.start_trace(FLAGS.profiling_output)
+       profiler_started = True
+
+     prefill_result, _ = engine.prefill(
+         params=params, padded_tokens=tokens, true_length=true_length
+     )
+     decode_state = engine.insert(
+         prefill_result, decode_state, slot=jnp.int32(i)
+     )
+     jax.block_until_ready(decode_state)
+
+   end = time.perf_counter()
+   return (end - start) / nums, decode_state, profiler_started
+


def interactive():
  """Run interactive"""
@@ -206,6 +244,101 @@ def interactive():
    print("---- All output text.")
    print(tokenizer.decode(sampled_tokens_list))

+ def _save_benchmark_to_file(filename, prefill_times_ms, decode_time_ms):
+   """Save the benchmark numbers as a small markdown table."""
+   lines = [
+       " # Offline benchmark numbers",
+       " ## Model: " + FLAGS.model_id,
+       " ## Batch size: {}".format(FLAGS.override_batch_size),
+       " ## Quantize: {}".format(FLAGS.quantize_weights),
+       " | | time (ms) |",
+       " |-------|-----------|",
+   ] + [
+       "| Prefill {} | {} |".format(x, y) for x, y in prefill_times_ms.items()
+   ] + [
+       "| Decode | {} |".format(decode_time_ms)
+   ]
+   with open(filename, "w") as f:
+     f.write("\n".join(lines))
+     f.flush()
+
+
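+ # For illustration, the saved file is a small markdown report along these
+ # lines (model name and timings are made-up placeholders):
+ #    # Offline benchmark numbers
+ #    ## Model: <model_id>
+ #   | Prefill 1024 | 120.5 |
+ #   | Decode | 40.2 |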
+ def benchmark_offline():
+   """Run the engine offline and print prefill/decode benchmark numbers."""
+   _check_model_id()
+   devices = server_lib.get_devices()
+   print(f"devices: {devices}")
+   pt_engine = create_engine(devices)
+
+   start = time.perf_counter()
+   params = pt_engine.load_params()
+   print("Load params ", time.perf_counter() - start)
+
+   prefill_times = {}
+
+   decode_state = pt_engine.init_decode_state()
+   profiler_started = False
+   # Sweep prefill lengths 16, 32, ..., 1024 (2**4 .. 2**10); note that `batch`
+   # here is the padded prefill sequence length passed as `seqlen`.
+   for exp in range(4, 11):
+     batch = 2**exp
+     runtime, decode_state, profiler_started = _run_prefill_time(
+         pt_engine, params, decode_state, batch, profiler_started
+     )
+     prefill_times[batch] = runtime
+
+   sampled_tokens_list = []
+
+   for i in range(3):  # warm up
+     # pylint: disable-next=all
+     decode_state, sampled_tokens = pt_engine.generate(
+         params=params, decode_state=decode_state
+     )
+     sampled_tokens_list.append(sampled_tokens)
+
+   profiling_output = FLAGS.profiling_output
+   print("======= decode starting ===")
+
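+   # Time 10 decode steps. The first two are dropped as warm-up when averaging
+   # below; if profiling is enabled, the trace starts at step 7 so it captures a
+   # few steady-state steps.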
+   dec_times = []
+   for i in range(10):
+     if profiling_output and i == 7 and not profiler_started:
+       jax.profiler.start_trace(profiling_output)
+       profiler_started = True
+     start = time.perf_counter()
+     # pylint: disable-next=all
+     decode_state, sampled_tokens = pt_engine.generate(params, decode_state)
+     jax.block_until_ready(decode_state)
+     sampled_tokens_list.append(sampled_tokens)
+     end = time.perf_counter()
+     dec_times.append(end - start)
+     print(i, "decode time", (end - start))
+
+   if profiler_started:
+     jax.profiler.stop_trace()
+
+   print("prefill ", prefill_times)
+   avg_decode_times = sum(dec_times[2:]) / len(dec_times[2:])
+   print("decode", avg_decode_times)
+
+   prefill_times_ms = {k: v * 1000 for k, v in prefill_times.items()}
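+   # Average per-step decode latency in ms over the 8 timed iterations
+   # (10 runs minus the 2 warm-up steps dropped above).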
+   decode_time_ms = sum(dec_times[2:]) * 1000 / 8
+
+   largest_prefill = max(prefill_times.items())
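+   # Two rough throughput estimates: decode-only tokens/sec (one token per batch
+   # slot per decode step), and an end-to-end figure that also charges the cost
+   # of prefilling every slot at the largest measured prefill length.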
+   print("MAX tokens:", FLAGS.batch_size / avg_decode_times)
+
+   time2 = (FLAGS.batch_size * FLAGS.max_decode_length) / (
+       FLAGS.batch_size * largest_prefill[1]
+       + FLAGS.max_decode_length * avg_decode_times
+   )
+   print("MAX tokens 2:", time2)
+
+   if FLAGS.benchmark_save_offline_result_to_file:
+     _save_benchmark_to_file(
+         FLAGS.benchmark_save_offline_result_to_file,
+         prefill_times_ms,
+         decode_time_ms,
+     )
+
+


def main():
  """Main function."""
@@ -221,6 +354,8 @@ def main_real(argv):
    serve()
  elif argv[1] == "interactive":
    interactive()
+   elif argv[1] == "benchmark_offline":
+     benchmark_offline()
  else:
    print(
        "Invalid arguments. please specify 'list', 'serve', or 'interactive'."