@@ -2145,6 +2145,9 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
+            # take care of unused raw tokens
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNKNOWN
 
             tokens.append(text)
             scores.append(score)
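Note on the new `[UNUSED` check: InternLM2's SentencePiece model ships placeholder pieces such as `[UNUSED_TOKEN_145]`, which the chat fine-tunes repurpose for special tokens like `<|im_end|>` (see the `_try_get_sft_eos` helper removed further down). Tagging them `UNKNOWN` here lets the added-token passes below overwrite those slots from `tokenizer_config.json` / `tokenizer.json`. A toy sketch of the effect (piece name taken from the removed helper; toktypes shown as strings for brevity):

```python
piece = '[UNUSED_TOKEN_145]'  # reserved placeholder piece in the vocab
toktype = "NORMAL"
if piece.startswith('[UNUSED'):
    toktype = "UNKNOWN"       # later passes may now overwrite this slot
print(piece, "->", toktype)   # [UNUSED_TOKEN_145] -> UNKNOWN
```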
@@ -2160,6 +2163,49 @@ def set_vocab(self):
                     scores.append(-1000.0)
                     toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = token_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert tokens[token_id] == token
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if token_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    if token_data["content"] == '<|im_end|>':
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for token_data in added_tokens:
+                    token_id = int(token_data["id"])
+                    token = token_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert tokens[token_id] == token
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if token_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
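For reference, the shape these two new passes consume: Hugging Face tokenizers list added tokens in `tokenizer_config.json` under `added_tokens_decoder` (a dict keyed by string token ids) and in `tokenizer.json` under `added_tokens` (a list of objects with an `"id"` field). A minimal sketch of the lookup, using a hypothetical id for `<|im_end|>` (the real id depends on the model's vocabulary):

```python
# Abbreviated, hypothetical tokenizer_config.json fragment; real files carry
# many more entries and fields (lstrip, rstrip, normalized, ...).
added_tokens_decoder = {
    "92542": {"content": "<|im_end|>", "special": True},
}

chat_eos_token = '<|im_end|>'
chat_eos_token_id = None
for token_id, token_data in added_tokens_decoder.items():
    if token_data["content"] == chat_eos_token:
        chat_eos_token_id = int(token_id)  # later overrides the default EOS

print(chat_eos_token_id)  # 92542 in this sketch
```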
@@ -2169,28 +2215,16 @@ def set_vocab(self):
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if "chat" in os.path.basename(self.dir_model.absolute()):
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")
 
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
-
     def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
         if n_head_kv is not None and n_head != n_head_kv:
             n_head = n_head_kv
@@ -2209,7 +2243,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
-
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
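For completeness, a sketch of the `config.json` fragment that would trigger the new `rope_scaling` branch in `set_gguf_parameters` (values illustrative, not from a specific checkpoint):

```python
# Hypothetical hparams excerpt; only type == "linear" together with a
# "factor" key reaches the two gguf_writer calls added above.
hparams = {"rope_scaling": {"type": "linear", "factor": 2.0}}

if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
    if hparams["rope_scaling"].get("type") == "linear":
        print("linear rope scaling, factor:", hparams["rope_scaling"]["factor"])
```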