[Spec][Ngram] 4/N: Remove max_match_window_size and min_match_window_size, matching all suffixes of the Trie (#21225)

kpham-sgl · web-flow · commit f83665807700 · 2026-04-01T22:09:46.000-07:00
diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md
@@ -295,12 +295,10 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 ## Ngram speculative decoding
 | Argument | Description | Defaults | Options |
 | --- | --- | --- | --- |
-| `--speculative-ngram-min-match-window-size` | The minimum window size for pattern matching in ngram speculative decoding. | `1` | Type: int |
-| `--speculative-ngram-max-match-window-size` | The maximum window size for pattern matching in ngram speculative decoding. | `12` | Type: int |
 | `--speculative-ngram-min-bfs-breadth` | The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding. | `1` | Type: int |
 | `--speculative-ngram-max-bfs-breadth` | The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding. | `10` | Type: int |
 | `--speculative-ngram-match-type` | Ngram tree-building mode. `BFS` selects recency-based expansion and `PROB` selects frequency-based expansion. This setting is forwarded to the ngram cache implementation. | `BFS` | `BFS`, `PROB` |
-| `--speculative-ngram-max-trie-depth` | The max trie depth for ngram speculative decoding. | `18` | Type: int |
+| `--speculative-ngram-max-trie-depth` | Maximum suffix length stored and matched by the ngram trie. | `18` | Type: int |
 | `--speculative-ngram-capacity` | The cache capacity for ngram speculative decoding. | `10000000` | Type: int |
 
 ## Multi-layer Eagle speculative decoding
diff --git a/docs/advanced_features/speculative_decoding.md b/docs/advanced_features/speculative_decoding.md
@@ -387,13 +387,11 @@ Enable it with:
 
 | Parameter | Description | Default |
 |---|---|---|
-| `--speculative-num-draft-tokens` | Number of draft tokens verified per step. If omitted, defaults to `--speculative-ngram-max-match-window-size`. | `12` (with default ngram settings) |
-| `--speculative-ngram-min-match-window-size` | Minimum matching window size. | `1` |
-| `--speculative-ngram-max-match-window-size` | Maximum matching window size. | `12` |
+| `--speculative-num-draft-tokens` | Number of draft tokens verified per step. If omitted, defaults to `12` for ngram speculative decoding (a warning is logged when the default is applied). | `12` |
 | `--speculative-ngram-min-bfs-breadth` | Minimum BFS breadth. | `1` |
 | `--speculative-ngram-max-bfs-breadth` | Maximum BFS breadth. | `10` |
 | `--speculative-ngram-match-type` | Ngram tree-building mode: `"BFS"` for recency-based expansion or `"PROB"` for frequency-based expansion. | `"BFS"` |
-| `--speculative-ngram-max-trie-depth` | The max trie depth for ngram speculative decoding. | `18` |
+| `--speculative-ngram-max-trie-depth` | Maximum suffix length stored and matched by the ngram trie. | `18` |
 | `--speculative-ngram-capacity` | Cache capacity (number of entries). | `10,000,000` |
 
 Notes:
@@ -408,7 +406,6 @@ python3 -m sglang.launch_server \
     --model Qwen/Qwen2.5-7B-Instruct \
     --speculative-algorithm NGRAM \
     --speculative-num-draft-tokens 16 \
-    --speculative-ngram-max-match-window-size 12 \
     --speculative-ngram-max-bfs-breadth 10 \
     --mem-fraction-static 0.7 \
     --cuda-graph-max-bs 8 \
@@ -464,12 +461,10 @@ Below is a comprehensive list of all speculative decoding parameters available i
 
 | Parameter | Type | Default | Description |
 |---|---|---|---|
-| `--speculative-ngram-min-match-window-size` | `int` | `1` | Minimum ngram matching window |
-| `--speculative-ngram-max-match-window-size` | `int` | `12` | Maximum ngram matching window |
 | `--speculative-ngram-min-bfs-breadth` | `int` | `1` | Minimum BFS breadth |
 | `--speculative-ngram-max-bfs-breadth` | `int` | `10` | Maximum BFS breadth |
 | `--speculative-ngram-match-type` | `str` | `"BFS"` | Ngram tree-building mode: `"BFS"` for recency-based expansion or `"PROB"` for frequency-based expansion |
-| `--speculative-ngram-max-trie-depth` | `int` | `18` | Max trie depth for ngram speculative decoding |
+| `--speculative-ngram-max-trie-depth` | `int` | `18` | Maximum suffix length stored and matched by the ngram trie |
 | `--speculative-ngram-capacity` | `int` | `10,000,000` | Cache capacity |
 
 ### Environment variables
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
@@ -506,8 +506,6 @@ class ServerArgs:
     speculative_draft_model_quantization: Optional[str] = None
 
     # Speculative decoding (ngram)
-    speculative_ngram_min_match_window_size: int = 1
-    speculative_ngram_max_match_window_size: int = 12
     speculative_ngram_min_bfs_breadth: int = 1
     speculative_ngram_max_bfs_breadth: int = 10
     speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
@@ -3108,8 +3106,10 @@ def _handle_speculative_decoding(self):
             self.enable_mixed_chunk = False
             self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
             if self.speculative_num_draft_tokens is None:
-                self.speculative_num_draft_tokens = (
-                    self.speculative_ngram_max_match_window_size
+                self.speculative_num_draft_tokens = 12
+                logger.warning(
+                    "speculative_num_draft_tokens is set to 12 by default for ngram speculative decoding. "
+                    "You can override this by explicitly setting --speculative-num-draft-tokens."
                 )
             logger.warning(
                 "The overlap scheduler and mixed chunked prefill are disabled because of "
@@ -4851,18 +4851,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         )
 
         # Speculative decoding (ngram)
-        parser.add_argument(
-            "--speculative-ngram-min-match-window-size",
-            type=int,
-            default=ServerArgs.speculative_ngram_min_match_window_size,
-            help="The minimum window size for pattern matching in ngram speculative decoding.",
-        )
-        parser.add_argument(
-            "--speculative-ngram-max-match-window-size",
-            type=int,
-            default=ServerArgs.speculative_ngram_max_match_window_size,
-            help="The maximum window size for pattern matching in ngram speculative decoding.",
-        )
         parser.add_argument(
             "--speculative-ngram-min-bfs-breadth",
             type=int,
diff --git a/python/sglang/srt/speculative/cpp_ngram/ngram.cpp b/python/sglang/srt/speculative/cpp_ngram/ngram.cpp
@@ -13,23 +13,6 @@ Ngram::Ngram(size_t capacity, const Param& param) : param_(param) {
     throw std::runtime_error(
         "param_.max_trie_depth must be greater than 1, current value: " + std::to_string(param_.max_trie_depth));
   }
-  if (!(param_.min_match_window_size > 0)) {
-    throw std::runtime_error(
-        "min_match_window_size must be greater than 0, current value: " + std::to_string(param_.min_match_window_size));
-  }
-  if (!(param_.min_match_window_size <= param_.max_match_window_size)) {
-    throw std::runtime_error(
-        "min_match_window_size must be less than or equal to "
-        "max_match_window_size, current min_match_window_size: " +
-        std::to_string(param_.min_match_window_size) +
-        ", max_match_window_size: " + std::to_string(param_.max_match_window_size));
-  }
-  if (!(param_.max_match_window_size < param_.max_trie_depth)) {
-    throw std::runtime_error(
-        "max_match_window_size must be less than max_trie_depth, current "
-        "max_match_window_size: " +
-        std::to_string(param_.max_match_window_size) + ", max_trie_depth: " + std::to_string(param_.max_trie_depth));
-  }
   if (!(param_.min_bfs_breadth > 0)) {
     throw std::runtime_error(
         "min_bfs_breadth must be greater than 0, current value: " + std::to_string(param_.min_bfs_breadth));
@@ -53,20 +36,6 @@ Ngram::Ngram(size_t capacity, const Param& param) : param_(param) {
       }
     }
   }
-  for (auto config : param_.batch_min_match_window_size) {
-    if (config != std::numeric_limits<decltype(config)>::max()) {
-      if (!(config >= param_.min_match_window_size)) {
-        throw std::runtime_error(
-            "batch_min_match_window_size config value " + std::to_string(config) +
-            " must be greater than or equal to min_match_window_size: " + std::to_string(param_.min_match_window_size));
-      }
-      if (!(config <= param_.max_match_window_size)) {
-        throw std::runtime_error(
-            "batch_min_match_window_size config value " + std::to_string(config) +
-            " must be less than or equal to max_match_window_size: " + std::to_string(param_.max_match_window_size));
-      }
-    }
-  }
 
   trie_ = std::make_unique<Trie>(capacity, param_);
 
diff --git a/python/sglang/srt/speculative/cpp_ngram/ngram_corpus.py b/python/sglang/srt/speculative/cpp_ngram/ngram_corpus.py
@@ -26,8 +26,6 @@ class NgramCorpus:
     def __init__(
         self,
         max_trie_depth=18,
-        min_match_window_size=1,
-        max_match_window_size=10,
         min_bfs_breadth=1,
         max_bfs_breadth=8,
         draft_token_num=8,
@@ -36,8 +34,6 @@ def __init__(
     ):
         param = ngram_corpus_cpp.Param()
         param.max_trie_depth = max_trie_depth
-        param.min_match_window_size = min_match_window_size
-        param.max_match_window_size = max_match_window_size
         param.min_bfs_breadth = min_bfs_breadth
         param.max_bfs_breadth = max_bfs_breadth
         param.draft_token_num = draft_token_num
diff --git a/python/sglang/srt/speculative/cpp_ngram/ngram_corpus_binding.cpp b/python/sglang/srt/speculative/cpp_ngram/ngram_corpus_binding.cpp
@@ -21,17 +21,12 @@ PYBIND11_MODULE(ngram_corpus_cpp, m) {
       .def_readwrite("enable_router_mode", &Param::enable_router_mode)
       .def_readwrite("min_bfs_breadth", &Param::min_bfs_breadth)
       .def_readwrite("max_bfs_breadth", &Param::max_bfs_breadth)
-      .def_readwrite("min_match_window_size", &Param::min_match_window_size)
-      .def_readwrite("max_match_window_size", &Param::max_match_window_size)
       .def_readwrite("max_trie_depth", &Param::max_trie_depth)
       .def_readwrite("draft_token_num", &Param::draft_token_num)
       .def_readwrite("match_type", &Param::match_type)
-      .def_readwrite("batch_min_match_window_size", &Param::batch_min_match_window_size)
       .def_readwrite("batch_draft_token_num", &Param::batch_draft_token_num)
       .def("get_draft_token_num", &Param::get_draft_token_num, "")
-      .def("get_min_match_window_size", &Param::get_min_match_window_size, "")
       .def("parse", &Param::parse, "")
-      .def("resetBatchMinMatchWindowSize", &Param::resetBatchMinMatchWindowSize, "")
       .def("resetBatchReturnTokenNum", &Param::resetBatchReturnTokenNum, "")
       .def("detail", &Param::detail, "");
 
diff --git a/python/sglang/srt/speculative/cpp_ngram/param.h b/python/sglang/srt/speculative/cpp_ngram/param.h
@@ -17,13 +17,10 @@ struct Param {
   bool enable_router_mode;
   size_t min_bfs_breadth;
   size_t max_bfs_breadth;
-  size_t min_match_window_size;
-  size_t max_match_window_size;
   size_t max_trie_depth;
   size_t draft_token_num;
   std::string match_type;
 
-  std::vector<size_t> batch_min_match_window_size;
   std::vector<size_t> batch_draft_token_num;
 
   size_t get_draft_token_num(size_t batch_size) const {
@@ -36,16 +33,6 @@ struct Param {
     return draft_token_num - 1;
   }
 
-  size_t get_min_match_window_size(size_t batch_size) const {
-    if (batch_size < batch_min_match_window_size.size()) {
-      if (batch_min_match_window_size[batch_size] !=
-          std::numeric_limits<decltype(batch_min_match_window_size)::value_type>::max()) {
-        return batch_min_match_window_size[batch_size];
-      }
-    }
-    return min_match_window_size;
-  }
-
   std::vector<size_t> parse(const std::string& value) {
     // 0-1|10,2-3|20,
     std::vector<size_t> result;
@@ -96,10 +83,6 @@ struct Param {
     return result;
   }
 
-  void resetBatchMinMatchWindowSize(const std::string& value) {
-    batch_min_match_window_size = parse(value);
-  }
-
   void resetBatchReturnTokenNum(const std::string& value) {
     batch_draft_token_num = parse(value);
   }
@@ -108,13 +91,8 @@ struct Param {
     std::stringstream ss;
     ss << "enable = " << enable << ", enable_router_mode = " << enable_router_mode
        << ", min_bfs_breadth = " << min_bfs_breadth << ", max_bfs_breadth = " << max_bfs_breadth
-       << ", min_match_window_size = " << min_match_window_size << ", max_match_window_size = " << max_match_window_size
        << ", max_trie_depth = " << max_trie_depth << ", draft_token_num = " << draft_token_num
        << ", match_type = " << match_type;
-    ss << ", batch_min_match_window_size(" << batch_min_match_window_size.size() << ") = ";
-    for (int i = 0; i < batch_min_match_window_size.size(); ++i) {
-      ss << i << "|" << batch_min_match_window_size[i] << ",";
-    }
     ss << ", batch_draft_token_num(" << batch_draft_token_num.size() << ") = ";
     for (int i = 0; i < batch_draft_token_num.size(); ++i) {
       ss << i << "|" << batch_draft_token_num[i] << ",";
diff --git a/python/sglang/srt/speculative/cpp_ngram/trie.cpp b/python/sglang/srt/speculative/cpp_ngram/trie.cpp
@@ -19,7 +19,7 @@ Trie::Trie(size_t capacity, const Param& param) : param_(param) {
 }
 
 void Trie::insert(const int32_t* tokens, size_t len) {
-  for (size_t i = 0; i + param_.min_match_window_size < len; ++i) {
+  for (size_t i = 0; i < len; ++i) {
     auto start = tokens + i;
     auto end = start + std::min(len - i, param_.max_trie_depth);
 
@@ -100,14 +100,13 @@ void Trie::reset() {
   root_ = getNode();
 }
 
-std::vector<std::pair<TrieNode*, int32_t>>
-Trie::match(const int32_t* context, size_t len, size_t min_window, size_t max_window) const {
+std::vector<std::pair<TrieNode*, int32_t>> Trie::match(const int32_t* context, size_t len) const {
   std::vector<std::pair<TrieNode*, int32_t>> result;
-  result.reserve(max_window - min_window);
-  for (int32_t match_window_size = std::min(len, max_window); match_window_size >= static_cast<int32_t>(min_window);
-       --match_window_size) {
-    auto start = context + len - match_window_size;
-    auto end = start + match_window_size;
+  const auto max_match_depth = std::min(len, param_.max_trie_depth);
+  result.reserve(max_match_depth);
+  for (size_t match_depth = max_match_depth; match_depth > 0; --match_depth) {
+    auto start = context + len - match_depth;
+    auto end = start + match_depth;
     auto cursor = root_;
     while (start != end) {
       auto iter = cursor->child.find(*start);
@@ -118,27 +117,27 @@ Trie::match(const int32_t* context, size_t len, size_t min_window, size_t max_wi
       ++start;
       cursor = iter->second;
     }
-    if (cursor) {
-      result.emplace_back(std::make_pair(cursor, match_window_size));
+    if (cursor != nullptr && !cursor->child.empty()) {
+      result.emplace_back(cursor, static_cast<int32_t>(match_depth));
     }
   }
   return result;
 }
 
 Result Trie::buildRecency(
     const int32_t* context, size_t len, int32_t last_token, size_t draft_token_num, const Param& param) const {
-  auto anchors = match(context, len, param.min_match_window_size, param.max_match_window_size);
+  auto anchors = match(context, len);
 
-  double bfs_breadth_scale = double(param.max_bfs_breadth - param.min_bfs_breadth) /
-                             (param.max_match_window_size - param.min_match_window_size + 1);
+  const auto max_match_depth = std::max<int32_t>(1, static_cast<int32_t>(param.max_trie_depth - 1));
+  double bfs_breadth_scale = double(param.max_bfs_breadth - param.min_bfs_breadth) / max_match_depth;
 
   std::vector<Node> tree(draft_token_num + 1);
   int root = 0;
   int cursor = 1;
 
   for (auto [node, depth] : anchors) {
     std::queue<std::tuple<int32_t, double, const TrieNode*>> queue;
-    queue.push({root, (param.max_match_window_size - depth) * bfs_breadth_scale + param.min_bfs_breadth, node});
+    queue.push({root, (max_match_depth - depth) * bfs_breadth_scale + param.min_bfs_breadth, node});
     while (queue.size() && cursor <= static_cast<int>(draft_token_num)) {
       auto front = queue.front();
       queue.pop();
@@ -168,7 +167,7 @@ Result Trie::buildRecency(
 
 Result Trie::buildFrequency(
     const int32_t* context, size_t len, int32_t last_token, size_t draft_token_num, const Param& param) const {
-  auto anchors = match(context, len, param.min_match_window_size, param.max_match_window_size);
+  auto anchors = match(context, len);
 
   struct CompareByLastDouble {
     bool operator()(
diff --git a/python/sglang/srt/speculative/cpp_ngram/trie.h b/python/sglang/srt/speculative/cpp_ngram/trie.h
@@ -49,8 +49,7 @@ class Trie {
   void reset();
 
  private:
-  std::vector<std::pair<TrieNode*, int32_t>>
-  match(const int32_t* context, size_t len, size_t min_window, size_t max_window) const;
+  std::vector<std::pair<TrieNode*, int32_t>> match(const int32_t* context, size_t len) const;
 
   TrieNode* getNode() {
     auto node = node_pool_[--free_node_count_];
diff --git a/python/sglang/srt/speculative/ngram_worker.py b/python/sglang/srt/speculative/ngram_worker.py
@@ -41,18 +41,13 @@ def __init__(
         self.page_size = server_args.page_size
         self.draft_token_num: int = server_args.speculative_num_draft_tokens
         self.max_trie_depth: int = server_args.speculative_ngram_max_trie_depth
-        self.max_match_window_size: int = (
-            server_args.speculative_ngram_max_match_window_size
-        )
 
         self.max_batch_size = target_worker.max_running_requests
         self.device = f"cuda:{gpu_id}" if gpu_id >= 0 else "cuda"
 
         self._init_preallocated_tensors()
 
         self.ngram_corpus = NgramCorpus(
-            min_match_window_size=server_args.speculative_ngram_min_match_window_size,
-            max_match_window_size=server_args.speculative_ngram_max_match_window_size,
             min_bfs_breadth=server_args.speculative_ngram_min_bfs_breadth,
             max_bfs_breadth=server_args.speculative_ngram_max_bfs_breadth,
             match_type=server_args.speculative_ngram_match_type,
@@ -131,7 +126,7 @@ def _prepare_draft_tokens(
         batch_tokens = []
         for req in batch.reqs:
             check_token = self._efficient_concat_last_n(
-                req.origin_input_ids, req.output_ids, self.max_match_window_size
+                req.origin_input_ids, req.output_ids, self.max_trie_depth
             )
             batch_tokens.append(check_token)
         req_drafts, mask = self.ngram_corpus.batch_get(batch_tokens)
diff --git a/python/sglang/test/lora_utils.py b/python/sglang/test/lora_utils.py
@@ -768,8 +768,6 @@ def run_lora_multiple_batch_on_model_cases(
                 else {
                     "speculative_algorithm": "NGRAM",
                     "speculative_num_draft_tokens": 5,
-                    "speculative_ngram_min_match_window_size": 2,
-                    "speculative_ngram_max_match_window_size": 15,
                 }
             )
             srt_runner = SRTRunner(
diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py
@@ -574,8 +574,6 @@ def __init__(
         speculative_num_steps: Optional[int] = None,
         speculative_eagle_topk: Optional[int] = None,
         speculative_num_draft_tokens: Optional[int] = None,
-        speculative_ngram_min_match_window_size: Optional[int] = None,
-        speculative_ngram_max_match_window_size: Optional[int] = None,
         disable_overlap_schedule: bool = False,
         disable_custom_all_reduce: bool = False,
         torchao_config: Optional[str] = None,
@@ -606,12 +604,7 @@ def __init__(
             spec_kwargs["speculative_num_draft_tokens"] = speculative_num_draft_tokens
         elif speculative_algorithm == "NGRAM":
             spec_kwargs["speculative_algorithm"] = speculative_algorithm
-            spec_kwargs["speculative_ngram_min_match_window_size"] = (
-                speculative_ngram_min_match_window_size
-            )
-            spec_kwargs["speculative_ngram_max_match_window_size"] = (
-                speculative_ngram_max_match_window_size
-            )
+            spec_kwargs["speculative_num_draft_tokens"] = speculative_num_draft_tokens
 
         self.engine = Engine(
             model_path=model_path,
diff --git a/test/registered/spec/utils/test_ngram_corpus.py b/test/registered/spec/utils/test_ngram_corpus.py