Merged

Changes from 2 commits
src/transformers/tokenization_utils_base.py (2 changes: 1 addition & 1 deletion)
@@ -2110,7 +2110,7 @@ def from_pretrained(
if "tokenizer_file" in vocab_files and not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
# mistral tokenizer names are different, but we can still convert them if
# mistral common is not there
other_pattern = re.escape("tekken.json|tokenizer.model.*")
other_pattern = r"tekken\.json|tokenizer\.model\.*"
if match := re.search(other_pattern, "\n".join(remote_files)):
vocab_files["vocab_file"] = match.group()
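Context on the fix: `re.escape` also escapes the `|` alternation and the trailing `.*`, so the old pattern could only match the literal string `tekken.json|tokenizer.model.*`, which never appears in a repo file listing, and the tekken fallback never fired. A minimal sketch of the two behaviours, using a hypothetical remote file listing:

```python
import re

# Hypothetical listing of files in a hub repo that ships a tekken tokenizer.
remote_files = ["config.json", "tekken.json", "preprocessor_config.json"]

# Old pattern: re.escape turns "|", "." and "*" into literals, so the search
# can only hit the exact text 'tekken.json|tokenizer.model.*' -- never present.
escaped = re.escape("tekken.json|tokenizer.model.*")
print(re.search(escaped, "\n".join(remote_files)))  # None

# New pattern: the alternation survives, so either file name matches.
raw = r"tekken\.json|tokenizer\.model\.*"
print(re.search(raw, "\n".join(remote_files)).group())  # 'tekken.json'
```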

tests/models/auto/test_tokenization_auto.py (11 changes: 11 additions & 0 deletions)
@@ -18,9 +18,12 @@
 import sys
 import tempfile
 import unittest
+import importlib
 from pathlib import Path
+from unittest import mock

 import pytest
+from huggingface_hub import hf_hub_download

 import transformers
 from transformers import (
@@ -181,6 +184,14 @@ def test_from_pretrained_use_fast_toggle(self):
         )
         self.assertIsInstance(AutoTokenizer.from_pretrained("google-bert/bert-base-cased"), BertTokenizerFast)

+    @require_tokenizers
+    def test_voxtral_tokenizer_converts_from_tekken(self):
+        repo_id = "mistralai/Voxtral-Mini-3B-2507"
+        tokenizer = AutoTokenizer.from_pretrained(repo_id)  # should not raise
+        self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
+        self.assertTrue(tokenizer.is_fast)
+        self.assertGreater(len(tokenizer("Voxtral")["input_ids"]), 0)
+
     @require_tokenizers
     def test_do_lower_case(self):
         tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", do_lower_case=False)
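For a quick manual check outside the test suite, a sketch along the lines of the new test (the repo id is the one used in the test; before the regex fix this call failed because `tekken.json` was never matched in the remote file list):

```python
from transformers import AutoTokenizer

# Loads the Voxtral tokenizer; with the fix, tekken.json is picked up and
# converted to a fast tokenizer instead of raising.
tok = AutoTokenizer.from_pretrained("mistralai/Voxtral-Mini-3B-2507")
print(tok.is_fast)                      # True
print(tok("Voxtral")["input_ids"][:5])  # non-empty list of token ids
```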