@@ -2144,6 +2144,9 @@ def set_vocab(self):
2144
2144
toktype = SentencePieceTokenTypes .UNUSED
2145
2145
elif tokenizer .IsByte (token_id ):
2146
2146
toktype = SentencePieceTokenTypes .BYTE
2147
+ # take care of unused raw token
2148
+ if piece .startswith ('[UNUSED' ):
2149
+ toktype = SentencePieceTokenTypes .UNKNOWN
2147
2150
2148
2151
tokens .append (text )
2149
2152
scores .append (score )
@@ -2159,6 +2162,47 @@ def set_vocab(self):
2159
2162
scores .append (- 1000.0 )
2160
2163
toktypes .append (SentencePieceTokenTypes .USER_DEFINED )
2161
2164
2165
+ chat_eos_token = '<|im_end|>'
2166
+ chat_eos_token_id = None
2167
+
2168
+ tokenizer_config_file = self .dir_model / 'tokenizer_config.json'
2169
+ if tokenizer_config_file .is_file ():
2170
+ with open (tokenizer_config_file , "r" , encoding = "utf-8" ) as f :
2171
+ tokenizer_config_json = json .load (f )
2172
+ added_tokens_decoder = tokenizer_config_json .get ("added_tokens_decoder" , {})
2173
+ for token_id , foken_data in added_tokens_decoder .items ():
2174
+ token_id = int (token_id )
2175
+ token = foken_data ["content" ]
2176
+ if token == chat_eos_token :
2177
+ chat_eos_token_id = token_id
2178
+ token = token .encode ("utf-8" )
2179
+ if toktypes [token_id ] != SentencePieceTokenTypes .UNKNOWN :
2180
+ assert (tokens [token_id ] == token )
2181
+ tokens [token_id ] = token
2182
+ scores [token_id ] = - 1000.0
2183
+ toktypes [token_id ] = SentencePieceTokenTypes .USER_DEFINED
2184
+ if foken_data .get ("special" ):
2185
+ toktypes [token_id ] = SentencePieceTokenTypes .CONTROL
2186
+
2187
+ tokenizer_file = self .dir_model / 'tokenizer.json'
2188
+ if tokenizer_file .is_file ():
2189
+ with open (tokenizer_file , "r" , encoding = "utf-8" ) as f :
2190
+ tokenizer_json = json .load (f )
2191
+ added_tokens = tokenizer_json .get ("added_tokens" , [])
2192
+ for foken_data in added_tokens :
2193
+ token_id = int (foken_data ["id" ])
2194
+ token = foken_data ["content" ]
2195
+ if token == chat_eos_token :
2196
+ chat_eos_token_id = token_id
2197
+ token = token .encode ("utf-8" )
2198
+ if toktypes [token_id ] != SentencePieceTokenTypes .UNKNOWN :
2199
+ assert (tokens [token_id ] == token )
2200
+ tokens [token_id ] = token
2201
+ scores [token_id ] = - 1000.0
2202
+ toktypes [token_id ] = SentencePieceTokenTypes .USER_DEFINED
2203
+ if foken_data .get ("special" ):
2204
+ toktypes [token_id ] = SentencePieceTokenTypes .CONTROL
2205
+
2162
2206
self .gguf_writer .add_tokenizer_model ("llama" )
2163
2207
self .gguf_writer .add_tokenizer_pre ("default" )
2164
2208
self .gguf_writer .add_token_list (tokens )
@@ -2168,28 +2212,16 @@ def set_vocab(self):
2168
2212
2169
2213
special_vocab = gguf .SpecialVocab (self .dir_model , n_vocab = len (tokens ))
2170
2214
old_eos = special_vocab .special_token_ids ["eos" ]
2171
- if "chat" in os . path . basename ( self . dir_model . absolute ()) :
2215
+ if chat_eos_token_id is not None :
2172
2216
# For the chat model, we replace the eos with '<|im_end|>'.
2173
2217
# TODO: this is a hack, should be fixed
2174
2218
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
2175
- special_vocab .special_token_ids ["eos" ] = self . _try_get_sft_eos ( tokenizer )
2176
- logger .warning (f"Replace eos:{ old_eos } with a special token:{ special_vocab . special_token_ids [ 'eos' ] } \
2177
- in chat mode so that the conversation can end normally." )
2219
+ special_vocab .special_token_ids ["eos" ] = chat_eos_token_id
2220
+ logger .warning (f"Replace eos:{ old_eos } with a special token:{ chat_eos_token_id } "
2221
+ " in chat mode so that the conversation can end normally." )
2178
2222
2179
2223
special_vocab .add_to_gguf (self .gguf_writer )
2180
2224
2181
- def _try_get_sft_eos (self , tokenizer ):
2182
- unused_145_list = tokenizer .Encode ('[UNUSED_TOKEN_145]' )
2183
- im_end_list = tokenizer .Encode ('<|im_end|>' )
2184
- eos_token = None
2185
- assert (len (unused_145_list ) == 1 ) ^ (len (im_end_list ) == 1 )
2186
- if len (unused_145_list ) == 1 :
2187
- eos_token = unused_145_list [0 ]
2188
- if len (im_end_list ) == 1 :
2189
- eos_token = im_end_list [0 ]
2190
- assert eos_token
2191
- return eos_token
2192
-
2193
2225
def _hf_permute_qk (self , weights , n_head : int , n_head_kv : int ):
2194
2226
if n_head_kv is not None and n_head != n_head_kv :
2195
2227
n_head = n_head_kv
@@ -2208,6 +2240,10 @@ def set_gguf_parameters(self):
2208
2240
self .gguf_writer .add_layer_norm_rms_eps (self .hparams ["rms_norm_eps" ])
2209
2241
self .gguf_writer .add_head_count_kv (self .hparams ["num_key_value_heads" ])
2210
2242
self .gguf_writer .add_file_type (self .ftype )
2243
+ if self .hparams .get ("rope_scaling" ) is not None and "factor" in self .hparams ["rope_scaling" ]:
2244
+ if self .hparams ["rope_scaling" ].get ("type" ) == "linear" :
2245
+ self .gguf_writer .add_rope_scaling_type (gguf .RopeScalingType .LINEAR )
2246
+ self .gguf_writer .add_rope_scaling_factor (self .hparams ["rope_scaling" ]["factor" ])
2211
2247
2212
2248
def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
2213
2249
num_heads = self .hparams ["num_attention_heads" ]
0 commit comments