@@ -20988,14 +20988,106 @@ int32_t llama_tokenize(
20988
20988
return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
20989
20989
}
20990
20990
20991
- int32_t llama_token_to_piece(
20992
- const struct llama_model * model,
20993
- llama_token token,
20994
- char * buf,
20995
- int32_t length,
20996
- int32_t lstrip,
20997
- bool special) {
20998
- return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
20991
+ // errors: 'c': copy, 'i': ignore, 'r': replace 0xFFFD, 'v': verbose
20992
+ static std::string llama_decode_text(const std::string & text, const char errors = 'v') {
20993
+ std::string decoded_text;
20994
+
20995
+ const auto cpts = unicode_cpts_from_utf8(text);
20996
+ for (const auto cpt : cpts) {
20997
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
20998
+ try {
20999
+ decoded_text += unicode_utf8_to_byte(utf8);
21000
+ } catch (const std::out_of_range & /*e*/) {
21001
+ switch (errors) {
21002
+ case 'c':
21003
+ decoded_text += utf8; // copy original
21004
+ break;
21005
+ case 'r':
21006
+ decoded_text += "\xEF\xBF\xBD"; // 0xFFFD REPLACEMENT CHARACTER
21007
+ break;
21008
+ case 'v':
21009
+ decoded_text += format("[UNK_BYTE_0x%02X]", cpt);
21010
+ break;
21011
+ case 'i':
21012
+ default:
21013
+ // ignore
21014
+ break;
21015
+ }
21016
+ }
21017
+ }
21018
+
21019
+ return decoded_text;
21020
+ }
21021
+
21022
+ // does not write null-terminator to buf
21023
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
21024
+ // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
21025
+ static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
21026
+ const llama_token_attr attr = llama_token_get_attr(model, token);
21027
+ if (!special && (attr & attr_special)) {
21028
+ return 0;
21029
+ }
21030
+
21031
+ // copy piece chars to output text buffer
21032
+ // skip up to 'lstrip' leading spaces before copying
21033
+ auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
21034
+ for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
21035
+ token++;
21036
+ size--;
21037
+ }
21038
+ if (length < (int32_t)size) {
21039
+ return (int32_t) -size;
21040
+ }
21041
+ memcpy(buf, token, size);
21042
+ return (int32_t) size;
21043
+ };
21044
+
21045
+ // if we have a cache - use it
21046
+ {
21047
+ const auto & cache = model->vocab.cache_token_to_piece;
21048
+
21049
+ if (!cache.empty()) {
21050
+ const auto & result = cache.at(token);
21051
+ return _try_copy(result.data(), result.size());
21052
+ }
21053
+ }
21054
+
21055
+ if (0 <= token && token < llama_n_vocab(model)) {
21056
+ const std::string & token_text = model->vocab.id_to_token[token].text;
21057
+ switch (llama_vocab_get_type(model->vocab)) {
21058
+ case LLAMA_VOCAB_TYPE_WPM:
21059
+ case LLAMA_VOCAB_TYPE_SPM:
21060
+ case LLAMA_VOCAB_TYPE_UGM: {
21061
+ // NOTE: we accept all unsupported token types,
21062
+ // suppressing them like CONTROL tokens.
21063
+ if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
21064
+ return _try_copy(token_text.data(), token_text.size());
21065
+ } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
21066
+ std::string result = token_text;
21067
+ llama_unescape_whitespace(result);
21068
+ return _try_copy(result.data(), result.size());
21069
+ } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
21070
+ char byte = (char) llama_token_to_byte(model->vocab, token);
21071
+ return _try_copy((char*) &byte, 1);
21072
+ }
21073
+ break;
21074
+ }
21075
+ case LLAMA_VOCAB_TYPE_BPE: {
21076
+ // NOTE: we accept all unsupported token types,
21077
+ // suppressing them like CONTROL tokens.
21078
+ if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
21079
+ return _try_copy(token_text.data(), token_text.size());
21080
+ } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
21081
+ std::string result = llama_decode_text(token_text, 'c'); // copy on error //TODO: use a tokenizer variable
21082
+ return _try_copy(result.data(), result.size());
21083
+ }
21084
+ break;
21085
+ }
21086
+ default:
21087
+ GGML_ASSERT(false);
21088
+ }
21089
+ }
21090
+ return 0;
20999
21091
}
21000
21092
21001
21093
int32_t llama_detokenize(
0 commit comments