Add support for ptd in runner (#8957)

pytorchbot · lucylq · web-flow · commit 95f779ae2120 · 2025-03-05T09:31:17.000-08:00
Partner engineers call ExecuTorch (ET) through LlamaModule (https://github.com/pytorch/executorch/blob/main/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java), which is a wrapper around the runner (https://www.internalfb.com/code/fbsource/[90d251fc01a84871b679406d6dc855eb5ded82fd]/fbcode/executorch/examples/models/llama/runner/runner.cpp?lines=47). To support ptd there, this change adds an optional data_path argument to the Runner constructor; when it has a value, it is passed to Module together with model_path, otherwise Module is constructed from model_path alone as before. Differential Revision: [D70596210](https://our.internmc.facebook.com/intern/diff/D70596210/) ghstack-source-id: 269741205 Pull Request resolved: #8953 Co-authored-by: lucylq <lfq@meta.com>
diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp
@@ -39,19 +39,25 @@ static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 Runner::Runner(
     const std::string& model_path,
     const std::string& tokenizer_path,
-    const float temperature)
+    const float temperature,
+    std::optional<const std::string> data_path)
     // NOTE: we observed ~2x loading performance increase on iPhone 15
     // and a ~5% improvement on Galaxy S22 by switching to
     // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
     : temperature_(temperature),
-      module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
       tokenizer_path_(tokenizer_path),
       metadata_({
           {kEnableDynamicShape, false},
           {kMaxSeqLen, 128},
           {kUseKVCache, true},
           {kUseSDPAWithKVCache, false},
       }) {
+  if (data_path.has_value()) {
+    module_ = std::make_unique<Module>(
+        model_path, data_path.value(), Module::LoadMode::File);
+  } else {
+    module_ = std::make_unique<Module>(model_path, Module::LoadMode::File);
+  }
   ET_LOG(
       Info,
       "Creating LLaMa runner: model_path=%s, tokenizer_path=%s",
diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h
@@ -14,6 +14,7 @@
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <optional>
 #include <string>
 #include <unordered_map>
 
@@ -32,7 +33,8 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
   explicit Runner(
       const std::string& model_path,
       const std::string& tokenizer_path,
-      const float temperature = 0.8f);
+      const float temperature = 0.8f,
+      std::optional<const std::string> data_path = std::nullopt);
 
   bool is_loaded() const;
   ::executorch::runtime::Error load();