Skip to content

Commit 4e4205a

Browse files
compilade authored and Neo Zhang committed
tokenize : add --no-parse-special option (ggml-org#8423)
This should make it easier to explain how parse_special affects tokenization.
1 parent 2ed5fd5 commit 4e4205a

File tree

1 file changed

+7
-1
lines changed

1 file changed

+7
-1
lines changed

examples/tokenize/tokenize.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
2929
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
3030
fprintf(stream, " --stdin read prompt from standard input.\n");
3131
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
32+
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
3233
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
3334
fprintf(stream, " --show-count print the total number of tokens.\n");
3435
}
@@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
195196
// variables where to put any arguments we see.
196197
bool printing_ids = false;
197198
bool no_bos = false;
199+
bool no_parse_special = false;
198200
bool disable_logging = false;
199201
bool show_token_count = false;
200202
const char * model_path = NULL;
@@ -229,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
229231
else if (arg == "--no-bos") {
230232
no_bos = true;
231233
}
234+
else if (arg == "--no-parse-special") {
235+
no_parse_special = true;
236+
}
232237
else if (arg == "-p" || arg == "--prompt") {
233238
if (prompt_set) {
234239
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@@ -359,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
359364

360365
const bool model_wants_add_bos = llama_should_add_bos_token(model);
361366
const bool add_bos = model_wants_add_bos && !no_bos;
367+
const bool parse_special = !no_parse_special;
362368

363369
std::vector<llama_token> tokens;
364-
tokens = ::llama_tokenize(model, prompt, add_bos, true);
370+
tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
365371

366372
if (printing_ids) {
367373
printf("[");

0 commit comments

Comments
 (0)