Skip to content

Commit bb4c5c8

Browse files
committed
update internlm2
1 parent 148ec97 commit bb4c5c8

File tree

2 files changed

+59
-17
lines changed

2 files changed

+59
-17
lines changed

convert_hf_to_gguf.py

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2145,6 +2145,9 @@ def set_vocab(self):
21452145
toktype = SentencePieceTokenTypes.UNUSED
21462146
elif tokenizer.IsByte(token_id):
21472147
toktype = SentencePieceTokenTypes.BYTE
2148+
# take care of unused raw token
2149+
if piece.startswith('[UNUSED'):
2150+
toktype = SentencePieceTokenTypes.UNKNOWN
21482151

21492152
tokens.append(text)
21502153
scores.append(score)
@@ -2160,6 +2163,49 @@ def set_vocab(self):
21602163
scores.append(-1000.0)
21612164
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
21622165

2166+
chat_eos_token = '<|im_end|>'
2167+
chat_eos_token_id = None
2168+
2169+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2170+
if tokenizer_config_file.is_file():
2171+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2172+
tokenizer_config_json = json.load(f)
2173+
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
2174+
for token_id, foken_data in added_tokens_decoder.items():
2175+
token_id = int(token_id)
2176+
token = foken_data["content"]
2177+
if token == chat_eos_token:
2178+
chat_eos_token_id = token_id
2179+
token = token.encode("utf-8")
2180+
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
2181+
assert(tokens[token_id] == token)
2182+
tokens[token_id] = token
2183+
scores[token_id] = -1000.0
2184+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2185+
if foken_data.get("special"):
2186+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2187+
if foken_data["content"] == '<|im_end|>':
2188+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2189+
2190+
tokenizer_file = self.dir_model / 'tokenizer.json'
2191+
if tokenizer_file.is_file():
2192+
with open(tokenizer_file, "r", encoding="utf-8") as f:
2193+
tokenizer_json = json.load(f)
2194+
added_tokens = tokenizer_json.get("added_tokens", [])
2195+
for foken_data in added_tokens:
2196+
token_id = int(foken_data["id"])
2197+
token = foken_data["content"]
2198+
if token == chat_eos_token:
2199+
chat_eos_token_id = token_id
2200+
token = token.encode("utf-8")
2201+
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
2202+
assert(tokens[token_id] == token)
2203+
tokens[token_id] = token
2204+
scores[token_id] = -1000.0
2205+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2206+
if foken_data.get("special"):
2207+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2208+
21632209
self.gguf_writer.add_tokenizer_model("llama")
21642210
self.gguf_writer.add_tokenizer_pre("default")
21652211
self.gguf_writer.add_token_list(tokens)
@@ -2169,28 +2215,16 @@ def set_vocab(self):
21692215

21702216
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
21712217
old_eos = special_vocab.special_token_ids["eos"]
2172-
if "chat" in os.path.basename(self.dir_model.absolute()):
2218+
if chat_eos_token_id is not None:
21732219
# For the chat model, we replace the eos with '<|im_end|>'.
21742220
# TODO: this is a hack, should be fixed
21752221
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
2176-
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
2177-
logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
2178-
in chat mode so that the conversation can end normally.")
2222+
special_vocab.special_token_ids["eos"] = chat_eos_token_id
2223+
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
2224+
" in chat mode so that the conversation can end normally.")
21792225

21802226
special_vocab.add_to_gguf(self.gguf_writer)
21812227

2182-
def _try_get_sft_eos(self, tokenizer):
2183-
unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
2184-
im_end_list = tokenizer.Encode('<|im_end|>')
2185-
eos_token = None
2186-
assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
2187-
if len(unused_145_list) == 1:
2188-
eos_token = unused_145_list[0]
2189-
if len(im_end_list) == 1:
2190-
eos_token = im_end_list[0]
2191-
assert eos_token
2192-
return eos_token
2193-
21942228
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
21952229
if n_head_kv is not None and n_head != n_head_kv:
21962230
n_head = n_head_kv
@@ -2209,7 +2243,11 @@ def set_gguf_parameters(self):
22092243
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
22102244
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
22112245
self.gguf_writer.add_file_type(self.ftype)
2212-
2246+
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2247+
if self.hparams["rope_scaling"].get("type") == "linear":
2248+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2249+
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
2250+
22132251
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
22142252
num_heads = self.hparams["num_attention_heads"]
22152253
num_kv_heads = self.hparams["num_key_value_heads"]

prompts/chat-with-internlm.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
You are an AI assistant whose name is InternLM (书生·浦语).\n- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
2+
3+
User: Hello! Who are you?
4+
InternLM:

0 commit comments

Comments
 (0)