@@ -2143,6 +2143,9 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
+            # take care of unused raw tokens
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNKNOWN
 
             tokens.append(text)
             scores.append(score)
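Note on the hunk above: InternLM2's tokenizer.model reserves placeholder pieces named '[UNUSED_TOKEN_...]' that SentencePiece reports as ordinary pieces, so the new check retags them as UNKNOWN; that lets the added-token overrides in the next hunk reassign those slots (the removed _try_get_sft_eos below probed '[UNUSED_TOKEN_145]' for exactly this reason). A minimal standalone sketch of the same classification, assuming a SentencePiece model file at the hypothetical path 'tokenizer.model' (illustrative, not the converter itself):

    from sentencepiece import SentencePieceProcessor

    tokenizer = SentencePieceProcessor()
    tokenizer.LoadFromFile('tokenizer.model')  # assumed local path

    for token_id in range(tokenizer.vocab_size()):
        piece = tokenizer.IdToPiece(token_id)
        if piece.startswith('[UNUSED'):
            # placeholder slot: downgrade to UNKNOWN so a later pass may override it
            print(token_id, piece, '-> UNKNOWN')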
@@ -2158,6 +2161,47 @@ def set_vocab(self):
                     scores.append(-1000.0)
                     toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
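Note on the block above: the two new passes patch the SentencePiece-derived tokens/scores/toktypes lists in place with the Hugging Face side overrides, first from added_tokens_decoder in tokenizer_config.json, then from added_tokens in tokenizer.json, and record the id of '<|im_end|>' when they see it. The data they consume has roughly this shape; the id and fields below are a hand-written illustration, not read from a specific checkpoint:

    import json

    # tokenizer_config.json (excerpt, illustrative):
    # {
    #   "added_tokens_decoder": {
    #     "92542": { "content": "<|im_end|>", "special": true }
    #   }
    # }
    with open('tokenizer_config.json', encoding='utf-8') as f:
        config = json.load(f)

    # keys are string token ids, hence the int() cast in the converter
    for token_id, data in config.get('added_tokens_decoder', {}).items():
        print(int(token_id), data['content'], bool(data.get('special')))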
@@ -2167,28 +2211,16 @@ def set_vocab(self):
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if "chat" in os.path.basename(self.dir_model.absolute()):
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")
 
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
-
     def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
         if n_head_kv is not None and n_head != n_head_kv:
             n_head = n_head_kv
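Note on the removal: _try_get_sft_eos guessed the chat EOS by encoding two candidate strings and XOR-asserting that exactly one came back as a single token, and it only ran when the directory name happened to contain "chat". The rewrite keys off chat_eos_token_id instead, which is set only when '<|im_end|>' was actually found among the added tokens, so renamed model directories no longer break detection and base models (chat_eos_token_id stays None) keep their original EOS. A sketch of the resulting decision, with stand-in ids:

    old_eos = 2                # EOS id from the SentencePiece model (stand-in)
    chat_eos_token_id = 92542  # set via added tokens, else None (stand-in)

    eos = chat_eos_token_id if chat_eos_token_id is not None else old_eos
    print(f'eos token id: {eos}')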
@@ -2207,6 +2239,10 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
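Note on the last hunk: set_gguf_parameters now forwards linear RoPE scaling from the HF config into the GGUF metadata; only the "linear" type is handled here, as other architectures in this script do. The config.json fragment it reacts to looks like this, sketched with a stand-in factor value:

    # config.json (excerpt, illustrative):
    #   "rope_scaling": { "type": "linear", "factor": 2.0 }
    hparams = {"rope_scaling": {"type": "linear", "factor": 2.0}}  # stand-in

    rope_scaling = hparams.get("rope_scaling")
    if rope_scaling is not None and "factor" in rope_scaling:
        if rope_scaling.get("type") == "linear":
            print("LINEAR rope scaling, factor =", rope_scaling["factor"])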