Skip to content

Commit 1792715

Browse files
author
Fahad Alghanim
committed
Fix unstable tokenizer fingerprinting
Tokenizers backed by `tokenizers` can mutate truncation/padding state when called, which made dataset transform fingerprints unstable and prevented `.map(load_from_cache_file=True)` from reusing cached results. This change makes tokenizer hashing stable by temporarily clearing backend truncation/padding during serialization for fingerprinting, then restoring it. Add a regression test and a simple benchmark to demonstrate cache-hit speedups. Fixes #3847
1 parent 224b4e6 commit 1792715

File tree

3 files changed

+133
-3
lines changed

3 files changed

+133
-3
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import json
2+
import os
3+
import tempfile
4+
5+
from tokenizers import Tokenizer
6+
from tokenizers.models import WordLevel
7+
from tokenizers.pre_tokenizers import Whitespace
8+
from transformers import PreTrainedTokenizerFast
9+
10+
import datasets
11+
from utils import get_duration
12+
13+
14+
# Benchmark results are written next to this script, under ./results/<script_name>.json
RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))
16+
17+
18+
def _make_tokenizer() -> PreTrainedTokenizerFast:
    """Build a tiny word-level fast tokenizer entirely in memory (no network access)."""
    word_to_id = {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3}
    core = Tokenizer(WordLevel(vocab=word_to_id, unk_token="[UNK]"))
    core.pre_tokenizer = Whitespace()
    return PreTrainedTokenizerFast(tokenizer_object=core, unk_token="[UNK]", pad_token="[PAD]")
23+
24+
25+
@get_duration
def map_once(dataset: datasets.Dataset, tok: PreTrainedTokenizerFast):
    """Run one tokenizing `.map()` pass over `dataset`; @get_duration reports the elapsed time."""

    def tokenize(examples):
        return tok(examples["text"], truncation=True, padding="max_length", max_length=8)

    dataset.map(tokenize, batched=True, load_from_cache_file=True, remove_columns=["text"])
31+
32+
33+
def benchmark_map_cache_reuse():
    """Time two identical `.map()` passes: a cache miss, then what should be a cache hit.

    The second pass is only fast if the tokenizer's fingerprint is stable after
    it has been called once (i.e. calling it does not change its hash).
    """
    tok = _make_tokenizer()
    times = {}

    with tempfile.TemporaryDirectory() as tmp_dir:
        source = datasets.Dataset.from_dict({"text": ["hello world"] * 200_000})
        dataset_dir = os.path.join(tmp_dir, "stored")
        source.save_to_disk(dataset_dir)
        dataset = datasets.Dataset.load_from_disk(dataset_dir)

        # First pass writes the cache file.
        times["map tokenize (cache miss)"] = map_once(dataset, tok)
        # Second pass should reuse it — much faster when the fingerprint is stable.
        times["map tokenize (cache hit)"] = map_once(dataset, tok)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
50+
51+
52+
# Allow running the benchmark standalone: `python <this_script>.py`
if __name__ == "__main__":
    benchmark_map_cache_reuse()

src/datasets/utils/_dill.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,11 +212,42 @@ def create_spacyLanguage(config, bytes):
212212

213213
def _save_transformersPreTrainedTokenizerBase(pickler, obj):
    """Pickle a `transformers` tokenizer with stable bytes for fingerprinting.

    Two sources of hash instability are neutralized:
    - the `cache` attribute is replaced by an empty dict in the pickled state, and
    - backend (`tokenizers.Tokenizer`) truncation/padding settings are temporarily
      cleared, because calling the tokenizer can enable them and thereby change
      the serialized bytes across runs, which prevents
      `.map(load_from_cache_file=True)` from reusing cache files.

    The original backend settings are restored after serialization.
    """
    log(pickler, f"Tok: {obj}")
    # Copy the state so clearing the cache does not mutate the live tokenizer.
    state = obj.__dict__.copy()
    if "cache" in state and isinstance(state["cache"], dict):
        state["cache"] = {}

    backend_tokenizer = obj.__dict__.get("_tokenizer")
    truncation = padding = None
    if backend_tokenizer is not None and hasattr(backend_tokenizer, "truncation") and hasattr(backend_tokenizer, "padding"):
        truncation = backend_tokenizer.truncation
        padding = backend_tokenizer.padding

    # Clear each setting independently (best effort): a failure on one must not
    # stop the other from being cleared, and — unlike the previous version,
    # which nulled both saved settings on any exception — must not discard the
    # saved values needed to restore a setting that WAS successfully cleared.
    if truncation is not None and hasattr(backend_tokenizer, "no_truncation"):
        try:
            backend_tokenizer.no_truncation()
        except Exception:
            pass
    if padding is not None and hasattr(backend_tokenizer, "no_padding"):
        try:
            backend_tokenizer.no_padding()
        except Exception:
            pass

    try:
        # `state` still references the (now-cleared) backend tokenizer object,
        # so the pickled bytes no longer depend on runtime truncation/padding.
        pickler.save_reduce(type(obj), (), state=state, obj=obj)
    finally:
        # Restore each setting independently so a failed restore of one does
        # not prevent restoring the other.
        if backend_tokenizer is not None:
            if truncation is not None and hasattr(backend_tokenizer, "enable_truncation"):
                try:
                    backend_tokenizer.enable_truncation(**truncation)
                except Exception:
                    pass
            if padding is not None and hasattr(backend_tokenizer, "enable_padding"):
                try:
                    backend_tokenizer.enable_padding(**padding)
                except Exception:
                    pass
    log(pickler, "# Tok")
221252

222253

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import pytest
2+
from tokenizers import Tokenizer
3+
from tokenizers.models import WordLevel
4+
from tokenizers.pre_tokenizers import Whitespace
5+
from transformers import PreTrainedTokenizerFast
6+
7+
from datasets import Dataset
8+
from datasets.fingerprint import Hasher
9+
10+
11+
def _make_mutable_backend_tokenizer() -> PreTrainedTokenizerFast:
    """Return a tiny fast tokenizer built fully in memory, backed by `tokenizers.Tokenizer`.

    No network access is required; the backend's truncation/padding state can be
    mutated by calling the tokenizer, which is exactly what these tests exercise.
    """
    word_to_id = {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3}
    core = Tokenizer(WordLevel(vocab=word_to_id, unk_token="[UNK]"))
    core.pre_tokenizer = Whitespace()
    return PreTrainedTokenizerFast(tokenizer_object=core, unk_token="[UNK]", pad_token="[PAD]")
17+
18+
19+
def test_hasher_hash_tokenizer_stable_after_call():
    """Calling a tokenizer (enabling truncation/padding) must not change its hash."""
    tokenizer = _make_mutable_backend_tokenizer()
    before = Hasher.hash(tokenizer)
    tokenizer(["hello world"], truncation=True, padding="max_length", max_length=8)
    after = Hasher.hash(tokenizer)
    assert before == after
25+
26+
27+
def test_map_cache_reused_with_tokenizer_after_call(tmp_path):
    """Regression test for https://github.com/huggingface/datasets/issues/3847.

    Tokenizers can mutate backend truncation/padding state when called, which
    used to destabilize the transform fingerprint and defeat `.map()` cache reuse.
    """
    tokenizer = _make_mutable_backend_tokenizer()

    dataset = Dataset.from_dict({"text": ["hello world"] * 1000})
    on_disk = tmp_path / "stored"
    dataset.save_to_disk(on_disk)
    dataset = Dataset.load_from_disk(on_disk)

    def tokenize(examples):
        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=8)

    first = dataset.map(tokenize, batched=True, load_from_cache_file=True, remove_columns=["text"])
    second = dataset.map(tokenize, batched=True, load_from_cache_file=True, remove_columns=["text"])

    # Both runs must produce cache files, and the second must reuse the first's.
    assert first.cache_files and second.cache_files
    assert first.cache_files[0]["filename"] == second.cache_files[0]["filename"]

0 commit comments

Comments
 (0)