Skip to content

Commit fef4f25

Browse files
committed
remove Tail-Free sampling, ggml-org/llama.cpp#10071
add top_n_sigma, xtc_threshold (float, default 0.1), and xtc_probability (float) params
1 parent d984742 commit fef4f25

File tree

5 files changed

+93
-57
lines changed

5 files changed

+93
-57
lines changed

Diff for: examples/low_level_api/common.py

+23-9
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ class GptParams:
2121
ignore_eos: bool = False
2222
logit_bias: dict[int, float] = field(default_factory=dict)
2323
top_k: int = 40
24+
top_n_sigma: float = -1.00
2425
top_p: float = 0.95
25-
tfs_z: float = 1.00
26+
2627
typical_p: float = 1.00
2728
temp: float = 0.80
2829
repeat_penalty: float = 1.10
@@ -32,7 +33,8 @@ class GptParams:
3233
mirostat: int = 0
3334
mirostat_tau: float = 5.0
3435
mirostat_eta: float = 0.1
35-
36+
xtc_threshold: float = 0.1
37+
xtc_probability: float = 0.0
3638
model: str = "./models/llama-7B/ggml-model.bin"
3739
prompt: str = ""
3840
path_session: str = ""
@@ -147,14 +149,10 @@ def gpt_params_parse(argv=None):
147149
"--top_k", type=int, default=40, help="top-k sampling", dest="top_k"
148150
)
149151
parser.add_argument(
150-
"--top_p", type=float, default=0.95, help="top-p sampling", dest="top_p"
152+
"--top_n_sigma", type=float, default=-1.00, help="top-n-sigma sampling (-1.00 = disabled)", dest="top_n_sigma"
151153
)
152154
parser.add_argument(
153-
"--tfs",
154-
type=float,
155-
default=1.0,
156-
help="tail free sampling, parameter z (1.0 = disabled)",
157-
dest="tfs_z",
155+
"--top_p", type=float, default=0.95, help="top-p sampling", dest="top_p"
158156
)
159157
parser.add_argument(
160158
"--temp", type=float, default=0.80, help="temperature", dest="temp"
@@ -178,7 +176,7 @@ def gpt_params_parse(argv=None):
178176
type=float,
179177
default=0.0,
180178
help="repeat alpha frequency penalty (0.0 = disabled)",
181-
dest="tfs_z",
179+
dest="frequency_penalty",
182180
)
183181
parser.add_argument(
184182
"--presence_penalty",
@@ -209,6 +207,22 @@ def gpt_params_parse(argv=None):
209207
dest="mirostat_eta",
210208
)
211209

210+
parser.add_argument(
211+
"--xtc_threshold",
212+
type=float,
213+
default=0.1,
214+
help="Sets a minimum probability threshold for tokens to be removed (default: 0.1)",
215+
dest="xtc_threshold",
216+
)
217+
218+
parser.add_argument(
219+
"--xtc_probability",
220+
type=float,
221+
default=0.0,
222+
help="Sets the chance for token removal (checked once on sampler start) (default: 0.0)",
223+
dest="xtc_probability",
224+
)
225+
212226
parser.add_argument(
213227
"-m",
214228
"--model",

Diff for: examples/low_level_api/low_level_api_chat_cpp.py

+15-12
Original file line numberDiff line numberDiff line change
@@ -275,14 +275,17 @@ def __init__(self, params: GptParams) -> None:
275275
presence_penalty = {self.params.presence_penalty},\
276276
frequency_penalty = {self.params.frequency_penalty},\
277277
top_k = {self.params.top_k},\
278-
tfs_z = {self.params.tfs_z},\
278+
top_n_sigma = {self.params.top_n_sigma},\
279279
top_p = {self.params.top_p},\
280280
typical_p = {self.params.typical_p},\
281281
temp = {self.params.temp},\
282282
mirostat = {self.params.mirostat},\
283283
mirostat_lr = {self.params.mirostat_eta},\
284284
mirostat_ent = {self.params.mirostat_tau},\
285285
286+
xtc_threshold = {self.params.xtc_threshold},\
287+
xtc_probability = {self.params.xtc_probability},\
288+
286289
generate: n_ctx = {self.n_ctx},\
287290
n_batch = {self.params.n_batch},\
288291
n_predict = {self.params.n_predict},\
@@ -454,7 +457,7 @@ def generate(self):
454457
_arr = (llama_cpp.llama_token * last_n_repeat)(
455458
*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat :]
456459
)
457-
llama_cpp.llama_sample_repetition_penalties(
460+
llama_cpp.llama_sampler_init_penalties(
458461
ctx=self.ctx,
459462
candidates=candidates_p,
460463
last_tokens_data=_arr,
@@ -474,15 +477,15 @@ def generate(self):
474477

475478
if self.params.temp <= 0:
476479
# Greedy sampling
477-
id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
480+
id = llama_cpp.llama_sampler_init_greedy(self.ctx, candidates_p)
478481
else:
479482
if self.params.mirostat == 1:
480483
mirostat_mu = 2.0 * self.params.mirostat_tau
481484
mirostat_m = 100
482-
llama_cpp.llama_sample_temperature(
485+
llama_cpp.llama_sampler_init_temp(
483486
self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
484487
)
485-
id = llama_cpp.llama_sample_token_mirostat(
488+
id = llama_cpp.llama_sampler_init_mirostat(
486489
self.ctx,
487490
candidates_p,
488491
llama_cpp.c_float(self.params.mirostat_tau),
@@ -495,7 +498,7 @@ def generate(self):
495498
llama_cpp.llama_sample_temperature(
496499
self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
497500
)
498-
id = llama_cpp.llama_sample_token_mirostat_v2(
501+
id = llama_cpp.llama_sampler_init_mirostat_v2(
499502
self.ctx,
500503
candidates_p,
501504
llama_cpp.c_float(self.params.mirostat_tau),
@@ -504,31 +507,31 @@ def generate(self):
504507
)
505508
else:
506509
# Temperature sampling
507-
llama_cpp.llama_sample_top_k(
510+
llama_cpp.llama_sampler_init_top_k(
508511
self.ctx,
509512
candidates_p,
510513
top_k,
511514
min_keep=llama_cpp.c_size_t(1),
512515
)
513-
llama_cpp.llama_sample_tail_free(
516+
llama_cpp.llama_sampler_init_top_n_sigma(
514517
self.ctx,
515518
candidates_p,
516-
llama_cpp.c_float(self.params.tfs_z),
519+
llama_cpp.c_float(self.params.top_n_sigma),
517520
min_keep=llama_cpp.c_size_t(1),
518521
)
519-
llama_cpp.llama_sample_typical(
522+
llama_cpp.llama_sampler_init_typical(
520523
self.ctx,
521524
candidates_p,
522525
llama_cpp.c_float(self.params.typical_p),
523526
min_keep=llama_cpp.c_size_t(1),
524527
)
525-
llama_cpp.llama_sample_top_p(
528+
llama_cpp.llama_sampler_init_top_p(
526529
self.ctx,
527530
candidates_p,
528531
llama_cpp.c_float(self.params.top_p),
529532
min_keep=llama_cpp.c_size_t(1),
530533
)
531-
llama_cpp.llama_sample_temperature(
534+
llama_cpp.llama_sampler_init_temp(
532535
self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
533536
)
534537
id = llama_cpp.llama_sample_token(self.ctx, candidates_p)

Diff for: llama_cpp/_internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -570,9 +570,9 @@ class LlamaSamplingParams:
570570
n_prev: int = 64
571571
n_probs: int = 0
572572
top_k: int = 40
573+
top_n_sigma: float = -1.00
573574
top_p: float = 0.95
574575
min_p: float = 0.05
575-
tfs_z: float = 1.00
576576
typical_p: float = 1.00
577577
temp: float = 0.80
578578
penalty_last_n: int = 64

Diff for: llama_cpp/llama.py

-17
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,6 @@ def _init_sampler(
677677
repeat_penalty: float = 1.0,
678678
frequency_penalty: float = 0.0,
679679
presence_penalty: float = 0.0,
680-
tfs_z: float = 1.0,
681680
mirostat_mode: int = 0,
682681
mirostat_eta: float = 0.1,
683682
mirostat_tau: float = 5.0,
@@ -771,7 +770,6 @@ def sample(
771770
repeat_penalty: float = 1.0,
772771
frequency_penalty: float = 0.0,
773772
presence_penalty: float = 0.0,
774-
tfs_z: float = 1.0,
775773
mirostat_mode: int = 0,
776774
mirostat_eta: float = 0.1,
777775
mirostat_tau: float = 5.0,
@@ -809,7 +807,6 @@ def sample(
809807
repeat_penalty=repeat_penalty,
810808
frequency_penalty=frequency_penalty,
811809
presence_penalty=presence_penalty,
812-
tfs_z=tfs_z,
813810
mirostat_mode=mirostat_mode,
814811
mirostat_tau=mirostat_tau,
815812
mirostat_eta=mirostat_eta,
@@ -841,7 +838,6 @@ def generate(
841838
reset: bool = True,
842839
frequency_penalty: float = 0.0,
843840
presence_penalty: float = 0.0,
844-
tfs_z: float = 1.0,
845841
mirostat_mode: int = 0,
846842
mirostat_tau: float = 5.0,
847843
mirostat_eta: float = 0.1,
@@ -883,7 +879,6 @@ def generate(
883879
repeat_penalty=repeat_penalty,
884880
frequency_penalty=frequency_penalty,
885881
presence_penalty=presence_penalty,
886-
tfs_z=tfs_z,
887882
mirostat_mode=mirostat_mode,
888883
mirostat_tau=mirostat_tau,
889884
mirostat_eta=mirostat_eta,
@@ -938,7 +933,6 @@ def generate(
938933
repeat_penalty=repeat_penalty,
939934
frequency_penalty=frequency_penalty,
940935
presence_penalty=presence_penalty,
941-
tfs_z=tfs_z,
942936
mirostat_mode=mirostat_mode,
943937
mirostat_tau=mirostat_tau,
944938
mirostat_eta=mirostat_eta,
@@ -1157,7 +1151,6 @@ def _create_completion(
11571151
top_n_sigma: float = -1.00,
11581152
stream: bool = False,
11591153
seed: Optional[int] = None,
1160-
tfs_z: float = 1.0,
11611154
mirostat_mode: int = 0,
11621155
mirostat_tau: float = 5.0,
11631156
mirostat_eta: float = 0.1,
@@ -1348,7 +1341,6 @@ def logit_bias_processor(
13481341
min_p=min_p,
13491342
typical_p=typical_p,
13501343
temp=temperature,
1351-
tfs_z=tfs_z,
13521344
mirostat_mode=mirostat_mode,
13531345
mirostat_tau=mirostat_tau,
13541346
mirostat_eta=mirostat_eta,
@@ -1783,7 +1775,6 @@ def create_completion(
17831775
top_n_sigma: float = -1.00,
17841776
stream: bool = False,
17851777
seed: Optional[int] = None,
1786-
tfs_z: float = 1.0,
17871778
mirostat_mode: int = 0,
17881779
mirostat_tau: float = 5.0,
17891780
mirostat_eta: float = 0.1,
@@ -1815,7 +1806,6 @@ def create_completion(
18151806
top_n_sigma: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1.00, -1.00 = disabled).
18161807
stream: Whether to stream the results.
18171808
seed: The seed to use for sampling.
1818-
tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
18191809
mirostat_mode: The mirostat sampling mode.
18201810
mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
18211811
mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
@@ -1852,7 +1842,6 @@ def create_completion(
18521842
top_n_sigma=top_n_sigma,
18531843
stream=stream,
18541844
seed=seed,
1855-
tfs_z=tfs_z,
18561845
mirostat_mode=mirostat_mode,
18571846
mirostat_tau=mirostat_tau,
18581847
mirostat_eta=mirostat_eta,
@@ -1889,7 +1878,6 @@ def __call__(
18891878
top_n_sigma: float = -1.00,
18901879
stream: bool = False,
18911880
seed: Optional[int] = None,
1892-
tfs_z: float = 1.0,
18931881
mirostat_mode: int = 0,
18941882
mirostat_tau: float = 5.0,
18951883
mirostat_eta: float = 0.1,
@@ -1921,7 +1909,6 @@ def __call__(
19211909
top_n_sigma: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1.00, -1.00 = disabled).
19221910
stream: Whether to stream the results.
19231911
seed: The seed to use for sampling.
1924-
tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
19251912
mirostat_mode: The mirostat sampling mode.
19261913
mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
19271914
mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
@@ -1958,7 +1945,6 @@ def __call__(
19581945
top_n_sigma=top_n_sigma,
19591946
stream=stream,
19601947
seed=seed,
1961-
tfs_z=tfs_z,
19621948
mirostat_mode=mirostat_mode,
19631949
mirostat_tau=mirostat_tau,
19641950
mirostat_eta=mirostat_eta,
@@ -1992,7 +1978,6 @@ def create_chat_completion(
19921978
presence_penalty: float = 0.0,
19931979
frequency_penalty: float = 0.0,
19941980
repeat_penalty: float = 1.0,
1995-
tfs_z: float = 1.0,
19961981
mirostat_mode: int = 0,
19971982
mirostat_tau: float = 5.0,
19981983
mirostat_eta: float = 0.1,
@@ -2029,7 +2014,6 @@ def create_chat_completion(
20292014
presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
20302015
frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
20312016
repeat_penalty: The penalty to apply to repeated tokens.
2032-
tfs_z: The tail-free sampling parameter.
20332017
mirostat_mode: The mirostat sampling mode.
20342018
mirostat_tau: The mirostat sampling tau parameter.
20352019
mirostat_eta: The mirostat sampling eta parameter.
@@ -2071,7 +2055,6 @@ def create_chat_completion(
20712055
presence_penalty=presence_penalty,
20722056
frequency_penalty=frequency_penalty,
20732057
repeat_penalty=repeat_penalty,
2074-
tfs_z=tfs_z,
20752058
mirostat_mode=mirostat_mode,
20762059
mirostat_tau=mirostat_tau,
20772060
mirostat_eta=mirostat_eta,

0 commit comments

Comments
 (0)