Commit 0517afc

[llm] Fix start_pos not being updated in prefill_chunk()
1 parent 72a095f commit 0517afc

File tree

4 files changed: +367, -5 lines
Lines changed: 361 additions & 0 deletions
@@ -0,0 +1,361 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
 */

#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/llm/runner/text_prefiller.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>

using namespace ::testing;
using executorch::extension::llm::TextDecoderRunner;
using executorch::extension::llm::TextPrefiller;
using executorch::runtime::Error;
using executorch::runtime::Result;
using executorch::runtime::testing::TensorFactory;

// Mock class for TextDecoderRunner
class MockTextDecoderRunner : public TextDecoderRunner {
 public:
  MockTextDecoderRunner() : TextDecoderRunner(nullptr, false) {}
  MOCK_METHOD(
      Result<executorch::aten::Tensor>,
      step,
      (executorch::extension::TensorPtr&, executorch::extension::TensorPtr&),
      ());
  MOCK_METHOD(bool, is_method_loaded, (), ());
  MOCK_METHOD(Result<uint64_t>, prefill, (std::vector<uint64_t>&, int64_t), ());
  MOCK_METHOD(::executorch::runtime::Error, load, (), ());
  MOCK_METHOD(uint64_t, logits_to_token, (const executorch::aten::Tensor&), ());
};

// Test fixture for TextPrefiller tests
class TextPrefillerTest : public Test {
 protected:
  void SetUp() override {
    // Set up default behavior for the text decoder runner
    ON_CALL(text_decoder_runner_, is_method_loaded())
        .WillByDefault(Return(true));
    ON_CALL(text_decoder_runner_, step)
        .WillByDefault([&](executorch::extension::TensorPtr&,
                           executorch::extension::TensorPtr&) {
          return Result<executorch::aten::Tensor>(tensor);
        });
    ON_CALL(text_decoder_runner_, logits_to_token)
        .WillByDefault([](const executorch::aten::Tensor&) { return 42; });
  }

  // Helper method to create a TextPrefiller with specific parameters
  std::unique_ptr<TextPrefiller> createTextPrefiller(
      int64_t max_seq_len,
      bool use_kv_cache = true,
      bool enable_parallel_prefill = false) {
    return std::make_unique<TextPrefiller>(
        &text_decoder_runner_,
        use_kv_cache,
        enable_parallel_prefill,
        max_seq_len);
  }

  // Create a mock TextPrefiller that allows us to spy on prefill_chunk calls
  class SpyTextPrefiller : public TextPrefiller {
   public:
    SpyTextPrefiller(
        TextDecoderRunner* text_decoder_runner,
        bool use_kv_cache,
        bool enable_parallel_prefill,
        int64_t max_seq_len)
        : TextPrefiller(
              text_decoder_runner,
              use_kv_cache,
              enable_parallel_prefill,
              max_seq_len) {}

    MOCK_METHOD(
        ::executorch::runtime::Result<uint64_t>,
        prefill_chunk,
        (std::vector<uint64_t>&, int64_t&),
        ());

    // Call the real implementation after recording the call
    ::executorch::runtime::Result<uint64_t> prefill_chunk_impl(
        std::vector<uint64_t>& prompt_tokens,
        int64_t& start_pos) {
      return TextPrefiller::prefill_chunk(prompt_tokens, start_pos);
    }
  };

  // Create a spy TextPrefiller
  std::unique_ptr<SpyTextPrefiller> createSpyTextPrefiller(
      int64_t max_seq_len,
      bool use_kv_cache = true,
      bool enable_parallel_prefill = false) {
    auto prefiller = std::make_unique<SpyTextPrefiller>(
        &text_decoder_runner_,
        use_kv_cache,
        enable_parallel_prefill,
        max_seq_len);

    // Set up the spy to call the real implementation
    ON_CALL(*prefiller, prefill_chunk)
        .WillByDefault(
            // Capture a raw pointer: unique_ptr is move-only, and the spy
            // object itself outlives this default action in the caller.
            [prefiller = prefiller.get()](
                std::vector<uint64_t>& tokens, int64_t& pos) {
              return prefiller->prefill_chunk_impl(tokens, pos);
            });

    return prefiller;
  }

  MockTextDecoderRunner text_decoder_runner_;
  std::vector<float> return_logits_ = {0.1f, 0.2f, 0.3f, 0.4f};
  TensorFactory<executorch::aten::ScalarType::Float> tf;
  executorch::aten::Tensor tensor = tf.make({1, 4}, return_logits_);
};

// Test that prefill() calls prefill_chunk() once when prompt tokens <=
// max_seq_len
TEST_F(TextPrefillerTest, PrefillCallsPrefillChunkOnceWhenPromptFits) {
  // Create a spy TextPrefiller with max_seq_len = 10
  auto prefiller = createSpyTextPrefiller(10);

  // Create prompt tokens with size <= max_seq_len
  std::vector<uint64_t> prompt_tokens = {1, 2, 3, 4, 5};
  int64_t start_pos = 0;

  // Expect prefill_chunk to be called exactly once with the entire prompt
  EXPECT_CALL(*prefiller, prefill_chunk(_, _))
      .Times(1)
      .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
        // Verify the tokens passed to prefill_chunk
        EXPECT_EQ(tokens.size(), prompt_tokens.size());
        for (size_t i = 0; i < tokens.size(); i++) {
          EXPECT_EQ(tokens[i], prompt_tokens[i]);
        }
        // Verify the position
        EXPECT_EQ(pos, start_pos);
        return Result<uint64_t>(42);
      });

  // Call prefill
  auto result = prefiller->prefill(prompt_tokens, start_pos);

  // Verify the result
  EXPECT_EQ(result.error(), Error::Ok);
  EXPECT_EQ(result.get(), 42);
}

// Test that prefill() calls prefill_chunk() multiple times when prompt tokens >
// max_seq_len
TEST_F(
    TextPrefillerTest,
    PrefillCallsPrefillChunkMultipleTimesWhenPromptExceedsMaxLen) {
  // Create a spy TextPrefiller with max_seq_len = 3
  const int64_t max_seq_len = 3;
  auto prefiller = createSpyTextPrefiller(max_seq_len);

  // Create prompt tokens with size > max_seq_len
  std::vector<uint64_t> prompt_tokens = {1, 2, 3, 4, 5, 6, 7, 8};
  int64_t start_pos = 0;

  // Calculate expected number of chunks
  const int expected_chunks =
      (prompt_tokens.size() + max_seq_len - 1) / max_seq_len;

  // Set up expectations for prefill_chunk calls
  {
    InSequence seq; // Ensure calls happen in the expected order

    // First chunk: tokens [1, 2, 3]
    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
          EXPECT_EQ(tokens.size(), 3);
          EXPECT_EQ(tokens[0], 1);
          EXPECT_EQ(tokens[1], 2);
          EXPECT_EQ(tokens[2], 3);
          EXPECT_EQ(pos, 0);
          return Result<uint64_t>(10);
        });

    // Second chunk: tokens [4, 5, 6]
    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
          EXPECT_EQ(tokens.size(), 3);
          EXPECT_EQ(tokens[0], 4);
          EXPECT_EQ(tokens[1], 5);
          EXPECT_EQ(tokens[2], 6);
          EXPECT_EQ(pos, 3);
          return Result<uint64_t>(20);
        });

    // Third chunk: tokens [7, 8]
    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
          EXPECT_EQ(tokens.size(), 2);
          EXPECT_EQ(tokens[0], 7);
          EXPECT_EQ(tokens[1], 8);
          EXPECT_EQ(pos, 6);
          return Result<uint64_t>(30);
        });
  }

  // Call prefill
  auto result = prefiller->prefill(prompt_tokens, start_pos);

  // Verify the result
  EXPECT_EQ(result.error(), Error::Ok);
  EXPECT_EQ(result.get(), 30); // Should return the token from the last chunk

  // Verify that start_pos has been updated correctly
  EXPECT_EQ(start_pos, prompt_tokens.size());
}

// Test that prefill() handles edge cases correctly
TEST_F(TextPrefillerTest, PrefillHandlesEdgeCasesCorrectly) {
  // Create a spy TextPrefiller with max_seq_len = 1
  const int64_t max_seq_len = 1;
  auto prefiller = createSpyTextPrefiller(max_seq_len);

  // Create prompt tokens with size > max_seq_len
  std::vector<uint64_t> prompt_tokens = {1, 2, 3};
  int64_t start_pos = 5; // Non-zero starting position

  // Set up expectations for prefill_chunk calls
  {
    InSequence seq;

    // First chunk: token [1]
    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
          EXPECT_EQ(tokens.size(), 1);
          EXPECT_EQ(tokens[0], 1);
          EXPECT_EQ(pos, 5);
          return Result<uint64_t>(10);
        });

    // Second chunk: token [2]
    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
          EXPECT_EQ(tokens.size(), 1);
          EXPECT_EQ(tokens[0], 2);
          EXPECT_EQ(pos, 6);
          return Result<uint64_t>(20);
        });

    // Third chunk: token [3]
    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
          EXPECT_EQ(tokens.size(), 1);
          EXPECT_EQ(tokens[0], 3);
          EXPECT_EQ(pos, 7);
          return Result<uint64_t>(30);
        });
  }

  // Call prefill
  auto result = prefiller->prefill(prompt_tokens, start_pos);

  // Verify the result
  EXPECT_EQ(result.error(), Error::Ok);
  EXPECT_EQ(result.get(), 30);

  // Verify that start_pos has been updated correctly
  EXPECT_EQ(start_pos, 8); // 5 (initial) + 3 (tokens)
}

// Test that prefill() handles errors from prefill_chunk correctly
TEST_F(TextPrefillerTest, PrefillHandlesPrefillChunkErrorsCorrectly) {
  // Create a spy TextPrefiller with max_seq_len = 3
  const int64_t max_seq_len = 3;
  auto prefiller = createSpyTextPrefiller(max_seq_len);

  // Create prompt tokens with size > max_seq_len
  std::vector<uint64_t> prompt_tokens = {1, 2, 3, 4, 5};
  int64_t start_pos = 0;

  // Set up expectations for prefill_chunk calls
  {
    InSequence seq;

    // First chunk: tokens [1, 2, 3] - succeeds
    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
          return Result<uint64_t>(10);
        });

    // Second chunk: tokens [4, 5] - fails
    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
          return Result<uint64_t>(Error::InvalidArgument);
        });
  }

  // Call prefill
  auto result = prefiller->prefill(prompt_tokens, start_pos);

  // Verify that the error is propagated
  EXPECT_EQ(result.error(), Error::InvalidArgument);
}

// Test that prefill_chunk() works correctly with parallel prefill enabled
TEST_F(TextPrefillerTest, PrefillChunkWorksWithParallelPrefill) {
  // Create a TextPrefiller with parallel prefill enabled
  auto prefiller = createTextPrefiller(10, true, true);

  // Set up expectations for the text decoder runner
  EXPECT_CALL(text_decoder_runner_, step(_, _))
      .Times(1)
      .WillOnce(Return(Result<executorch::aten::Tensor>(tensor)));

  EXPECT_CALL(text_decoder_runner_, logits_to_token(_))
      .Times(1)
      .WillOnce(Return(42));

  // Create prompt tokens
  std::vector<uint64_t> prompt_tokens = {1, 2, 3};
  int64_t start_pos = 0;

  // Call prefill
  auto result = prefiller->prefill(prompt_tokens, start_pos);

  // Verify the result
  EXPECT_EQ(result.error(), Error::Ok);
  EXPECT_EQ(result.get(), 42);

  // Verify that start_pos has been updated correctly
  EXPECT_EQ(start_pos, prompt_tokens.size());
}

// Test that prefill_chunk() works correctly with sequential prefill
TEST_F(TextPrefillerTest, PrefillChunkWorksWithSequentialPrefill) {
  // Create a TextPrefiller with sequential prefill (parallel prefill disabled)
  auto prefiller = createTextPrefiller(10, true, false);

  // Set up expectations for the text decoder runner
  EXPECT_CALL(text_decoder_runner_, step(_, _))
      .Times(3) // Once for each token
      .WillRepeatedly(Return(Result<executorch::aten::Tensor>(tensor)));

  EXPECT_CALL(text_decoder_runner_, logits_to_token(_))
      .Times(1) // Only called once at the end
      .WillOnce(Return(42));

  // Create prompt tokens
  std::vector<uint64_t> prompt_tokens = {1, 2, 3};
  int64_t start_pos = 0;

  // Call prefill
  auto result = prefiller->prefill(prompt_tokens, start_pos);

  // Verify the result
  EXPECT_EQ(result.error(), Error::Ok);
  EXPECT_EQ(result.get(), 42);

  // Verify that start_pos has been updated correctly
  EXPECT_EQ(start_pos, prompt_tokens.size());
}

extension/llm/runner/text_prefiller.cpp

Lines changed: 4 additions & 3 deletions
@@ -56,21 +56,22 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
          prompt_tokens_to_process.begin());
 
       // Process this chunk
-      auto chunk_result = prefillChunk(prompt_tokens_to_process, start_pos);
+      auto chunk_result = prefill_chunk(prompt_tokens_to_process, start_pos);
       ET_CHECK_OK_OR_RETURN_ERROR(chunk_result.error());
       cur_token = chunk_result.get();
 
+      start_pos += num_tokens_to_prefill_with;
       num_tokens_to_process += num_tokens_to_prefill_with;
     }
 
     return cur_token;
   } else {
     // If prompt tokens don't exceed max_seq_len_, process them directly
-    return prefillChunk(prompt_tokens, start_pos);
+    return prefill_chunk(prompt_tokens, start_pos);
   }
 }
 
-::executorch::runtime::Result<uint64_t> TextPrefiller::prefillChunk(
+::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
     std::vector<uint64_t>& prompt_tokens,
     int64_t& start_pos) {
   // enable_parallel_prefill_ maybe set even when not using kv cache
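
The added line `start_pos += num_tokens_to_prefill_with;` is the substance of the fix: prefill() splits a long prompt into max_seq_len_-sized chunks, and previously it advanced num_tokens_to_process between chunks but left start_pos untouched, so every chunk after the first was prefilled at a stale KV-cache position. Below is a minimal, self-contained sketch of the corrected chunking loop; prefill_chunk_stub, prefill_in_chunks, and the local variable names are placeholders for illustration, not the ExecuTorch API.

// Sketch only: mirrors the chunked-prefill control flow after this commit,
// with a stub standing in for TextPrefiller::prefill_chunk().
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for prefill_chunk(); just returns a fake "next token".
uint64_t prefill_chunk_stub(const std::vector<uint64_t>& chunk, int64_t start_pos) {
  (void)start_pos; // a real implementation would feed the model at this position
  return chunk.empty() ? 0 : chunk.back();
}

uint64_t prefill_in_chunks(
    const std::vector<uint64_t>& prompt_tokens,
    int64_t& start_pos,
    size_t max_seq_len) {
  uint64_t cur_token = 0;
  size_t num_tokens_processed = 0;
  while (num_tokens_processed < prompt_tokens.size()) {
    // Take at most max_seq_len tokens for this chunk.
    const size_t n =
        std::min(max_seq_len, prompt_tokens.size() - num_tokens_processed);
    std::vector<uint64_t> chunk(
        prompt_tokens.begin() + num_tokens_processed,
        prompt_tokens.begin() + num_tokens_processed + n);
    cur_token = prefill_chunk_stub(chunk, start_pos);
    start_pos += static_cast<int64_t>(n); // the update this commit adds
    num_tokens_processed += n;
  }
  return cur_token;
}

With the update in place, a caller that enters prefill() at position p leaves with start_pos == p + prompt_tokens.size(), which is exactly what the new PrefillHandlesEdgeCasesCorrectly test asserts (5 + 3 == 8).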

extension/llm/runner/text_prefiller.h

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ class ET_EXPERIMENTAL TextPrefiller {
    * Module.
    * @return The next token of the LLM Module after prefilling this chunk.
    */
-  ::executorch::runtime::Result<uint64_t> prefillChunk(
+  ::executorch::runtime::Result<uint64_t> prefill_chunk(
       std::vector<uint64_t>& prompt_tokens,
       int64_t& start_pos);

0 commit comments
