# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from transformers import AutoTokenizer

from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import IncrementalDetokenizer

# ruff: noqa: E501


def test_fast_inc_detok_invalid_utf8_err_case():
    """
    Test an edge case where the tokenizer can produce non-monotonic,
    invalid UTF-8 output, which breaks the internal state of
    tokenizers' DecodeStream.
    See https://github.com/vllm-project/vllm/issues/17448.

    Thanks to the reproducer from @fpaupier:
    https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3.
    """
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")

    # Build a minimal EngineCoreRequest: only the request id, prompt tokens,
    # sampling params, and arrival time are meaningful here; the unused
    # fields (multimodal inputs, LoRA, etc.) are left as None.
    prompt_token_ids = [107, 4606, 236787, 107]
    params = SamplingParams(skip_special_tokens=True)
    request = EngineCoreRequest(
        "test",
        prompt_token_ids,
        None,
        None,
        None,
        params,
        None,
        0.0,
        None,
        cache_salt=None,
        data_parallel_rank=None,
    )

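    # from_new_request should select the fast, Rust-backed implementation
    # (FastIncrementalDetokenizer) for this tokenizer; the slow fallback
    # would not exercise tokenizers' DecodeStream at all.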
    detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)

    assert detokenizer.__class__.__name__ == "FastIncrementalDetokenizer", \
        "Should use FastIncrementalDetokenizer by default"

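    # For context: the fast detokenizer wraps tokenizers' DecodeStream,
    # roughly like the sketch below (illustrative only, not vLLM's exact
    # code; `hf_tokenizer` stands in for the underlying tokenizers.Tokenizer):
    #
    #     from tokenizers.decoders import DecodeStream
    #     stream = DecodeStream(skip_special_tokens=True)
    #     for token_id in token_ids:
    #         piece = stream.step(hf_tokenizer, token_id)  # str or None
    #
    # step() may return None until enough bytes have accumulated to form
    # valid UTF-8; that internal byte-buffering state is what this test
    # stresses.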
    # Process tokens incrementally. These Gemma-3 token ids come from the
    # reproducer and decode to the French JSON fragment asserted below.
    test_tokens = [
        236840, 107, 138, 236782, 107, 140, 236775, 6265, 1083, 623, 121908,
        147418, 827, 107, 140, 236775, 6265, 236779, 2084, 1083, 623, 203292,
        827, 107, 140, 236775, 6265, 236779, 7777, 1083, 623, 121908, 147418,
        569, 537, 236789, 65880, 569, 537, 236789, 62580, 853, 115693, 210118,
        35178, 16055, 1270, 759, 215817, 4758, 1925, 1117, 827, 107, 140,
        236775, 5654, 1083, 623, 110733, 46291, 827, 107, 140, 236775, 5654,
        236779, 2084, 1083, 623, 136955, 56731, 827, 107, 140, 236775, 5654,
        236779, 7777, 1083, 623, 194776, 2947, 496, 109811, 1608, 890, 215817,
        4758, 1925, 1117, 2789, 432, 398, 602, 31118, 569, 124866, 134772, 509,
        19478, 1640, 33779, 236743, 236770, 236819, 236825, 236771, 432, 398,
        432, 237167, 827, 107, 140, 236775, 77984, 1083, 623, 2709, 236745,
        2555, 513, 236789, 602, 31118, 569
    ]

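    # Feed the tokens one at a time and collect the incremental (delta)
    # text after each step; finished=True on the last token lets the
    # detokenizer flush any text it is still holding back.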
    output = ""
    for i, token_id in enumerate(test_tokens):
        detokenizer.update([token_id], False)

        finished = i == len(test_tokens) - 1
        output += detokenizer.get_next_output_text(finished, delta=True)

    # fmt: off
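    # The accumulated text must match the reproducer's expected output
    # exactly; with the regression, DecodeStream's broken internal state
    # derailed decoding partway through this sequence.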
    assert output == r'''[
  {
    "source": "Résultats",
    "source_type": "CONCEPT",
    "source_description": "Résultats de l'analyse de l'impact des opérations israéliennes sur la frontière libanaise",
    "target": "Israël",
    "target_type": "ORGANIZATION",
    "target_description": "Pays qui a obtenu à sa frontière libanaise « un niveau de calme inédit depuis les années 1960 »",
    "relationship": "Obtention d'un niveau de'''