
Commit 450c20a

jaime-m-p authored and Nexesenex committed
Detokenizer fixes (ggml-org#8039)
Detokenizer fixes (ggml-org#8039)

* Add llama_detokenize():
  - Update header files location
  - UNKNOWN and CONTROL are 'special pieces'
  - Remove space after UNKNOWN and CONTROL
  - Refactor llama_token_to_piece()
  - Add flag: clean_up_tokenization_spaces
  - Symmetric params for llama_tokenize() and llama_detokenize()

* Update and fix tokenizer tests:
  - Using llama_detokenize()
  - Unexpected vocab type as test fail instead of error
    - Useful when automating tests, e.g. if you don't know in advance the vocab type
    - Differentiate other loading errors
  - Skip unicode surrogates and undefined
  - Gracefully exit threads
    - Using exit() is throwing random exceptions
  - Clean old known problematic codepoints
  - Minor: confusing hexadecimal codepoint

* Update bruteforce random tests:
  - Add detokenizer checks
  - New generator: ascii_lr_strip
  - New generator: apostrophe
  - Add more vocab files
  - Detokenize special tokens
  - Replace errors with '\uFFFD' when detokenizing to 'utf-8'
  - More edge cases
  - Better detokenization results check

* Fix add_space_prefix, set false by default

* Better leading space removal

* Do not remove space when decoding special tokens

* Bugfix: custom regex splits undefined unicode codepoints

* 'viking' detokenizer clean spaces
1 parent cfc0ee1 commit 450c20a
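
To make the intent of the commit message concrete, here is a minimal round-trip sketch using the common helpers this commit touches. It assumes an already-loaded llama_context; model setup and error handling are omitted, so treat it as illustrative rather than as code from the commit.

    #include "common.h"  // common helpers: llama_tokenize(), llama_detokenize()

    #include <string>
    #include <vector>

    // Tokenize a prompt and turn the tokens straight back into text.
    // `ctx` is assumed to be an already-initialized llama_context *.
    static std::string round_trip(llama_context * ctx, const std::string & text) {
        // add_special adds BOS/EOS as the model requires,
        // parse_special recognizes special tokens written in the text
        std::vector<llama_token> tokens = llama_tokenize(ctx, text, /*add_special=*/true, /*parse_special=*/true);

        // special = true renders special/control tokens back into the output
        return llama_detokenize(ctx, tokens, /*special=*/true);
    }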

File tree

11 files changed (+1358 −143 lines)


common/common.cpp

Lines changed: 21 additions & 37 deletions
@@ -2617,51 +2617,35 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
     std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-    return result;
+    return piece;
 }
 
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
+    text.resize(n_chars);
+
     // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
+    return text;
 }
 
 bool llama_should_add_bos_token(const llama_model * model) {
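
The wrappers above lean on the C API's convention that a negative return value is the required buffer size: resize and call again. A standalone sketch of that pattern follows; the helper name and initial buffer size are mine, not from the commit.

    #include "llama.h"

    #include <string>

    // Render one token as text via the raw C API, growing the buffer on demand.
    static std::string token_to_piece(const llama_model * model, llama_token token, bool special) {
        std::string piece(16, '\0');
        int n = llama_token_to_piece(model, token, &piece[0], (int) piece.size(), /*lstrip=*/0, special);
        if (n < 0) {
            // negative result: |n| is the size the piece actually needs
            piece.resize(-n);
            n = llama_token_to_piece(model, token, &piece[0], (int) piece.size(), /*lstrip=*/0, special);
        }
        piece.resize(n);
        return piece;
    }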

common/common.h

Lines changed: 4 additions & 12 deletions
@@ -367,21 +367,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
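
For callers, a short usage sketch of the new declaration: the printing wrapper is mine, and `special` follows the comment above about optionally rendering special/control tokens.

    #include "common.h"

    #include <cstdio>
    #include <vector>

    // Print a token sequence both with and without special/control tokens rendered.
    static void dump_tokens(llama_context * ctx, const std::vector<llama_token> & tokens) {
        // default: special = true, so control tokens show up as text
        printf("with special:    '%s'\n", llama_detokenize(ctx, tokens).c_str());
        // special = false leaves them out of the rendered string
        printf("without special: '%s'\n", llama_detokenize(ctx, tokens, /*special=*/false).c_str());
    }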

examples/batched.swift/Sources/main.swift

Lines changed: 2 additions & 1 deletion
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
@@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
             token,
             &result,
             Int32(result.count),
+            0,
             false
         )
         assert(check == actualTokensCount)
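
The only change at these Swift call sites is the extra `0` argument: llama_token_to_piece() gained an integer parameter between the buffer length and the `special` flag. A C-side sketch of the equivalent call follows; reading that parameter as "how many leading spaces the call may strip" is my assumption, not something this diff states.

    #include "llama.h"

    #include <string>

    // Same call the Swift code makes, with the new argument spelled out.
    static std::string piece_of(const llama_model * model, llama_token token) {
        char buf[64];
        // lstrip = 0 keeps the previous behaviour; special = false skips control tokens
        const int32_t n = llama_token_to_piece(model, token, buf, (int32_t) sizeof(buf), /*lstrip=*/0, /*special=*/false);
        // a negative return still means "buffer too small" (see common/common.cpp above)
        return n >= 0 ? std::string(buf, n) : std::string();
    }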

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

Lines changed: 2 additions & 2 deletions
@@ -322,15 +322,15 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
             newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {

0 commit comments

Comments
 (0)