18/N NPU executor: Further cleanup

MatthiasKreileder · copybara-github · commit 19154afd8e8f · 2025-06-24T09:52:38.000-07:00
LiteRT-LM-PiperOrigin-RevId: 775276323
diff --git a/runtime/core/engine_impl.cc b/runtime/core/engine_impl.cc
@@ -119,7 +119,7 @@ class EngineImpl : public Engine {
 
       std::filesystem::path path(model_path);
       ABSL_CHECK(std::filesystem::exists(path));
-      auto executor = odml::infra::LlmLiteRtNpuCompiledModelExecutor::Create(
+      auto executor = LlmLiteRtNpuCompiledModelExecutor::Create(
           engine_settings_.GetMainExecutorSettings(), *litert_model_resources_,
           path.parent_path().string());
       ABSL_CHECK_OK(executor);
diff --git a/runtime/engine/litert_lm_npu_main.cc b/runtime/engine/litert_lm_npu_main.cc
@@ -42,7 +42,7 @@ ABSL_FLAG(std::string, litert_dispatch_lib_path, "",
 ABSL_FLAG(std::string, prompt, "", "Prompt to run.");
 ABSL_FLAG(int, num_runs, 1, "Number of times to run the benchmark.");
 
-using odml::infra::LlmLiteRtNpuCompiledModelExecutor;
+using litert::lm::LlmLiteRtNpuCompiledModelExecutor;
 
 using litert::lm::InputText;
 using litert::lm::ThreadOptions;
@@ -209,7 +209,7 @@ RunStats CreateAndRun(const std::string& prompt) {
   auto executor_settings = litert::lm::LlmExecutorSettings::CreateDefault(
                                model_assets, litert::lm::Backend::NPU)
                                .value();
-  auto executor = odml::infra::LlmLiteRtNpuCompiledModelExecutor::Create(
+  auto executor = LlmLiteRtNpuCompiledModelExecutor::Create(
       executor_settings, **model_resources,
       absl::GetFlag(FLAGS_litert_dispatch_lib_path));
   auto end = absl::Now();
diff --git a/runtime/executor/llm_litert_npu_compiled_model_executor.cc b/runtime/executor/llm_litert_npu_compiled_model_executor.cc
@@ -1,6 +1,5 @@
 #include "runtime/executor/llm_litert_npu_compiled_model_executor.h"
 
-#include <cmath>
 #include <cstdint>
 #include <cstring>
 #include <limits>
@@ -22,35 +21,26 @@
 #include "absl/time/time.h"  // from @com_google_absl
 #include "absl/types/span.h"  // from @com_google_absl
 #include "litert/c/litert_common.h"  // from @litert
-#include "litert/c/litert_model.h"  // from @litert
 #include "litert/cc/litert_compiled_model.h"  // from @litert
 #include "litert/cc/litert_environment.h"  // from @litert
 #include "litert/cc/litert_layout.h"  // from @litert
 #include "litert/cc/litert_macros.h"  // from @litert
 #include "litert/cc/litert_model.h"  // from @litert
 #include "litert/cc/litert_tensor_buffer.h"  // from @litert
 #include "runtime/components/model_resources.h"
-#include "runtime/executor/executor_settings_base.h"
 #include "runtime/executor/litert_compiled_model_executor_utils.h"
 #include "runtime/executor/llm_executor_io_types.h"
 #include "runtime/executor/llm_executor_settings.h"
 #include "runtime/util/convert_tensor_buffer.h"
 #include "runtime/util/litert_status_util.h"
 #include "runtime/util/status_macros.h"  // NOLINT
 
-namespace odml::infra {
+namespace litert::lm {
 
 namespace {
 using ::litert::CompiledModel;
 using ::litert::Environment;
-using ::litert::Model;
 using ::litert::TensorBuffer;
-using ::litert::lm::CopyFromTensorBuffer;
-using ::litert::lm::ExecutorInputs;
-using ::litert::lm::ExecutorPrefillParams;
-using ::litert::lm::GetOptimizedPrefillWorkGroups;
-using ::litert::lm::ReferTensorBufferAsSpan;
-using ::litert::lm::SortedPrefillSignatureMap;
 
 constexpr char kPrefillSignature[] = "prefill_128";
 constexpr int kPrefillSize = 128;
@@ -538,8 +528,8 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::Decode(
   LITERT_ASSIGN_OR_RETURN(auto logits_buffer_int16,
                           CopyFromTensorBuffer<int16_t>(decoded_logits));
   int max_index = 0;
-  int16_t max_value = logits_buffer_int16[0];
-  for (int i = 1; i < logits_buffer_int16.size(); ++i) {
+  int16_t max_value = std::numeric_limits<int16_t>::min();
+  for (int i = 0; i < logits_buffer_int16.size(); ++i) {
     if (logits_buffer_int16[i] > max_value) {
       max_value = logits_buffer_int16[i];
       max_index = i;
@@ -1015,4 +1005,4 @@ LlmLiteRtNpuCompiledModelExecutor::Create(
   return executor;
 };
 
-}  // namespace odml::infra
+}  // namespace litert::lm
diff --git a/runtime/executor/llm_litert_npu_compiled_model_executor.h b/runtime/executor/llm_litert_npu_compiled_model_executor.h
@@ -38,7 +38,7 @@
 #include "runtime/executor/llm_executor_io_types.h"
 #include "runtime/executor/llm_executor_settings.h"
 
-namespace odml::infra {
+namespace litert::lm {
 
 // Component intended to be used with an NPU variant of Gemma3.
 class LlmLiteRtNpuCompiledModelExecutor : public ::litert::lm::LlmExecutor {
@@ -122,7 +122,7 @@ class LlmLiteRtNpuCompiledModelExecutor : public ::litert::lm::LlmExecutor {
   absl::Status Reset() override;
 
  private:
-  // Holds the tensor buffers maps for the inference of a precompiled model,
+  // Holds the tensor buffer maps for the inference of a precompiled model,
   // both for prefill and decode.
   struct InferenceContext {
     absl::flat_hash_map<absl::string_view, ::litert::TensorBuffer>
@@ -165,7 +165,8 @@ class LlmLiteRtNpuCompiledModelExecutor : public ::litert::lm::LlmExecutor {
   // signatures for Mask, RoPE and KV cache update computation.
   struct NpuAuxiliaryContext {
     ::litert::CompiledModel npu_auxiliary_compiled_model;
-    NpuAuxiliaryContext(::litert::CompiledModel npu_auxiliary_compiled_model);
+    explicit NpuAuxiliaryContext(
+        ::litert::CompiledModel npu_auxiliary_compiled_model);
   };
 
  protected:
@@ -188,9 +189,7 @@ class LlmLiteRtNpuCompiledModelExecutor : public ::litert::lm::LlmExecutor {
         llm_inference_context_(std::move(llm_inference_context)),
         cache_update_inference_context_(
             std::move(cache_update_inference_context)),
-        prefill_signature_map_(std::move(prefill_signature_map)) {
-    executor_settings_.SetMaxNumTokens(1280);
-  }
+        prefill_signature_map_(std::move(prefill_signature_map)) {}
 
  private:
   // Prefill internal implementation, for one prefill call to the Interpreter
@@ -309,6 +308,6 @@ class LlmLiteRtNpuCompiledModelExecutor : public ::litert::lm::LlmExecutor {
   int next_input_token_id_ = -1;
 };
 
-}  // namespace odml::infra
+}  // namespace litert::lm
 
 #endif  // THIRD_PARTY_ODML_LITERT_LM_RUNTIME_EXECUTOR_LITERT_NPU_COMPILED_MODEL_EXECUTOR_H_