-
Notifications
You must be signed in to change notification settings - Fork 31.6k
Auto convert tekken.json #42299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Auto convert tekken.json #42299
Changes from 5 commits
1f83f14
798c29f
55c1652
5f851fe
bdcde31
fb41fe3
70e8a37
66d3b89
416f4c6
0d0484d
699fb5c
16a833f
80a80ac
70e14eb
2f292b0
56afbe2
5689423
e0be8ad
214e3cf
865318a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| from dataclasses import dataclass | ||
| from pathlib import Path | ||
| from typing import TYPE_CHECKING, Any, Literal, NamedTuple, Optional, Union, overload | ||
| from huggingface_hub import list_repo_files | ||
|
|
||
| import numpy as np | ||
| from packaging import version | ||
|
|
@@ -150,7 +151,7 @@ def __str__(self): | |
|
|
||
| # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file | ||
| FULL_TOKENIZER_FILE = "tokenizer.json" | ||
| _re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json") | ||
| _re_tokenizer_file = re.compile(r"(tokenizer|tekken)\.(.*)\.json") | ||
|
|
||
|
|
||
| class TruncationStrategy(ExplicitEnum): | ||
|
|
@@ -2098,7 +2099,13 @@ def from_pretrained( | |
| template = template.removesuffix(".jinja") | ||
| vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja" | ||
|
|
||
| # Get files from url, cache, or disk depending on the case | ||
| remote_files = list_repo_files(pretrained_model_name_or_path) | ||
| if not re.search(vocab_files["tokenizer_file"], "".join(remote_files)): | ||
| # mistral tokenizer names are different, but we can still convert them if | ||
| # mistral common is not there | ||
| other_pattern = "tekken.json|tokenizer.model.*" | ||
| vocab_files["vocab_file"] = re.search(other_pattern, "".join(remote_files)).group() | ||
|
|
||
| resolved_vocab_files = {} | ||
| for file_id, file_path in vocab_files.items(): | ||
| if file_path is None: | ||
|
|
@@ -2417,6 +2424,22 @@ def _from_pretrained( | |
| "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" | ||
| " fine-tuned or trained." | ||
| ) | ||
| if tokenizer.vocab_size > 100000 and getattr(tokenizer.backend_tokenizer, "pre_tokenizer", None) is not None: | ||
| from huggingface_hub import model_info | ||
| def is_base_mistral(model_id: str) -> bool: | ||
| model = model_info(model_id) | ||
| if model.tags is not None: | ||
| if re.search("base_model:.*mistralai", "".join(model.tags)): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, so that's only for the mistral org, no? Should we directly check if `model_type in ["mistral", ...]` so that it also works for other orgs?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can't do that until we download the config / until the config is there |
||
| return True | ||
| return False | ||
ArthurZucker marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| if is_base_mistral(pretrained_model_name_or_path) and not kwargs.get("fix_regex"): | ||
| logger.warning( | ||
| f"The tokenizer you are loading from '{pretrained_model_name_or_path}'" | ||
| f" with an old regex pattern. This will lead to incorrect tokenization." | ||
ArthurZucker marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ) | ||
ArthurZucker marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| import tokenizers | ||
| tokenizer.backend_tokenizer.pre_tokenizer[0] = tokenizers.pre_tokenizers.Split(pattern=tokenizers.Regex(r"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"), behavior = "isolated") | ||
| return tokenizer | ||
|
|
||
| @staticmethod | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Usually the pattern is
(slow, fast); here it's (fast, fast) — not sure if intended. Maybe it should be instead:
so that we never have the slow one anyway?