@@ -376,7 +376,6 @@ struct llama_client_slot
376
376
377
377
int32_t num_prompt_tokens = 0 ;
378
378
int32_t num_prompt_tokens_processed = 0 ;
379
- int32_t multibyte_pending = 0 ;
380
379
381
380
json prompt;
382
381
std::string generated_text;
@@ -425,7 +424,6 @@ struct llama_client_slot
425
424
stopped_word = false ;
426
425
stopped_limit = false ;
427
426
stopping_word = " " ;
428
- multibyte_pending = 0 ;
429
427
n_past = 0 ;
430
428
sent_count = 0 ;
431
429
sent_token_probs_index = 0 ;
@@ -992,35 +990,36 @@ struct llama_server_context
992
990
slot.generated_text += token_str;
993
991
slot.has_next_token = true ;
994
992
995
- if (slot.multibyte_pending > 0 )
993
+ // check if there is an incomplete UTF-8 character at the end
994
+ bool incomplete = false ;
995
+ for (unsigned i = 1 ; i < 5 && i <= slot.generated_text .size (); ++i)
996
996
{
997
- slot. multibyte_pending -= token_str. size ();
998
- }
999
- else if (token_str. size () == 1 )
1000
- {
1001
- const char c = token_str[ 0 ] ;
1002
- // 2-byte characters: 110xxxxx 10xxxxxx
997
+ unsigned char c = slot. generated_text [slot. generated_text . size () - i] ;
998
+ if ((c & 0xC0 ) == 0x80 )
999
+ {
1000
+ // continuation byte: 10xxxxxx
1001
+ continue ;
1002
+ }
1003
1003
if ((c & 0xE0 ) == 0xC0 )
1004
1004
{
1005
- slot. multibyte_pending = 1 ;
1006
- // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
1005
+ // 2-byte character: 110xxxxx ...
1006
+ incomplete = i < 2 ;
1007
1007
}
1008
1008
else if ((c & 0xF0 ) == 0xE0 )
1009
1009
{
1010
- slot. multibyte_pending = 2 ;
1011
- // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1010
+ // 3-byte character: 1110xxxx ...
1011
+ incomplete = i < 3 ;
1012
1012
}
1013
1013
else if ((c & 0xF8 ) == 0xF0 )
1014
1014
{
1015
- slot.multibyte_pending = 3 ;
1016
- }
1017
- else
1018
- {
1019
- slot.multibyte_pending = 0 ;
1015
+ // 4-byte character: 11110xxx ...
1016
+ incomplete = i < 4 ;
1020
1017
}
1018
+ // else 1-byte character or invalid byte
1019
+ break ;
1021
1020
}
1022
1021
1023
- if (slot. multibyte_pending == 0 )
1022
+ if (!incomplete )
1024
1023
{
1025
1024
size_t pos = std::min (slot.sent_count , slot.generated_text .size ());
1026
1025
const std::string str_test = slot.generated_text .substr (pos);
@@ -1055,7 +1054,7 @@ struct llama_server_context
1055
1054
}
1056
1055
}
1057
1056
1058
- if (slot. multibyte_pending > 0 && !slot. has_next_token )
1057
+ if (incomplete )
1059
1058
{
1060
1059
slot.has_next_token = true ;
1061
1060
}
0 commit comments