Commit 0b1fb09

Tokenizer fixes ggml-org#8379
committed by jaime-m-p
1 parent: 7d88961

File tree

1 file changed: +100 -8 lines changed

src/llama.cpp

Lines changed: 100 additions & 8 deletions
@@ -20988,14 +20988,106 @@ int32_t llama_tokenize(
     return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
 }
 
-int32_t llama_token_to_piece(
-        const struct llama_model * model,
-                     llama_token   token,
-                            char * buf,
-                         int32_t   length,
-                         int32_t   lstrip,
-                            bool   special) {
-    return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
+// errors: 'c': copy, 'i': ignore, 'r': replace 0xFFFD, 'v': verbose
+static std::string llama_decode_text(const std::string & text, const char errors = 'v') {
+    std::string decoded_text;
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+    for (const auto cpt : cpts) {
+        const auto utf8 = unicode_cpt_to_utf8(cpt);
+        try {
+            decoded_text += unicode_utf8_to_byte(utf8);
+        } catch (const std::out_of_range & /*e*/) {
+            switch (errors) {
+                case 'c':
+                    decoded_text += utf8;  // copy original
+                    break;
+                case 'r':
+                    decoded_text += "\xEF\xBF\xBD";  // 0xFFFD REPLACEMENT CHARACTER
+                    break;
+                case 'v':
+                    decoded_text += format("[UNK_BYTE_0x%02X]", cpt);
+                    break;
+                case 'i':
+                default:
+                    // ignore
+                    break;
+            }
+        }
+    }
+
+    return decoded_text;
+}
+
+// does not write null-terminator to buf
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
+    const llama_token_attr attr = llama_token_get_attr(model, token);
+    if (!special && (attr & attr_special)) {
+        return 0;
+    }
+
+    // copy piece chars to output text buffer
+    // skip up to 'lstrip' leading spaces before copying
+    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+            token++;
+            size--;
+        }
+        if (length < (int32_t)size) {
+            return (int32_t) -size;
+        }
+        memcpy(buf, token, size);
+        return (int32_t) size;
+    };
+
+    // if we have a cache - use it
+    {
+        const auto & cache = model->vocab.cache_token_to_piece;
+
+        if (!cache.empty()) {
+            const auto & result = cache.at(token);
+            return _try_copy(result.data(), result.size());
+        }
+    }
+
+    if (0 <= token && token < llama_n_vocab(model)) {
+        const std::string & token_text = model->vocab.id_to_token[token].text;
+        switch (llama_vocab_get_type(model->vocab)) {
+            case LLAMA_VOCAB_TYPE_WPM:
+            case LLAMA_VOCAB_TYPE_SPM:
+            case LLAMA_VOCAB_TYPE_UGM: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = token_text;
+                    llama_unescape_whitespace(result);
+                    return _try_copy(result.data(), result.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+                    char byte = (char) llama_token_to_byte(model->vocab, token);
+                    return _try_copy((char*) &byte, 1);
+                }
+                break;
+            }
+            case LLAMA_VOCAB_TYPE_BPE: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = llama_decode_text(token_text, 'c'); // copy on error //TODO: use a tokenizer variable
+                    return _try_copy(result.data(), result.size());
+                }
+                break;
+            }
+            default:
+                GGML_ASSERT(false);
+        }
+    }
+    return 0;
 }
 
 int32_t llama_detokenize(
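
A note on the buffer contract this change makes explicit: _try_copy returns the negated required size when the destination buffer is too small, and the comment above llama_token_to_piece states that no null terminator is written. A minimal caller-side sketch of the resulting two-pass pattern (not part of this commit; it assumes only the public signature shown in the diff, and the helper name token_to_piece is hypothetical):

#include <string>
#include <vector>

#include "llama.h" // assumed to declare llama_token_to_piece as shown in the diff

// Convert one token to its text piece, growing the buffer on demand.
static std::string token_to_piece(const struct llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8); // deliberately small to exercise the retry path
    int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), /*lstrip=*/0, special);
    if (n < 0) {
        buf.resize((size_t) -n); // the negative result is the exact size required
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), /*lstrip=*/0, special);
    }
    // no null terminator is written, so build the string from the returned length
    return n > 0 ? std::string(buf.data(), (size_t) n) : std::string();
}

Two calls always suffice, because a failed first call reports the exact size needed rather than a generic error.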

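The errors parameter of llama_decode_text only matters for BPE pieces containing a codepoint missing from the byte-to-unicode table, where unicode_utf8_to_byte throws std::out_of_range. A self-contained sketch of the four policies, under stated assumptions: byte_from_utf8, decode_piece, and the naive UTF-8 split below are hypothetical simplifications of unicode_utf8_to_byte, llama_decode_text, and unicode_cpts_from_utf8.

#include <cstdio>
#include <initializer_list>
#include <stdexcept>
#include <string>

// Toy stand-in for unicode_utf8_to_byte(): maps ASCII to itself and
// U+0120 'Ġ' (UTF-8 C4 A0), which byte-level BPE vocabularies commonly
// use for the space byte 0x20; everything else is "unmapped".
static char byte_from_utf8(const std::string & utf8) {
    if (utf8 == "\xC4\xA0") {
        return ' ';
    }
    if (utf8.size() == 1 && (unsigned char) utf8[0] < 0x80) {
        return utf8[0];
    }
    throw std::out_of_range("unmapped codepoint");
}

// Mirrors the switch in llama_decode_text for each unmapped codepoint.
static std::string decode_piece(const std::string & piece, char errors) {
    std::string out;
    for (size_t i = 0; i < piece.size();) {
        // naive UTF-8 split, sufficient for the ASCII + 2-byte input below
        const size_t len = ((unsigned char) piece[i] >= 0xC0) ? 2 : 1;
        const std::string utf8 = piece.substr(i, len);
        i += len;
        try {
            out += byte_from_utf8(utf8);
        } catch (const std::out_of_range &) {
            switch (errors) {
                case 'c': out += utf8;           break; // copy the original UTF-8
                case 'r': out += "\xEF\xBF\xBD"; break; // U+FFFD REPLACEMENT CHARACTER
                case 'v': out += "[UNK_BYTE]";   break; // the real code embeds the codepoint value
                case 'i':
                default:                         break; // ignore: drop the codepoint
            }
        }
    }
    return out;
}

int main() {
    const std::string piece = "\xC4\xA0" "caf\xC3\xA9"; // "Ġcafé"; 'é' is unmapped here
    for (char mode : {'c', 'i', 'r', 'v'}) {
        std::printf("'%c' -> \"%s\"\n", mode, decode_piece(piece, mode).c_str());
    }
    return 0;
}

The commit chooses 'c' (copy on error) for BPE normal tokens, so a piece that cannot be mapped back to raw bytes passes through unchanged instead of corrupting the output.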