Merge pull request #4006 from Textualize/fix-grapheme-stuck

willmcgugan · web-flow · commit 7338cb9dafd0 · 2026-02-19T17:04:17.000Z
fix for infinite loop in split_graphemes
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased
+
+### Fixed
+
+- Fixed infinite loop with `cells.split_graphemes`
+
 ## [14.3.2] - 2026-02-01
 
 ### Fixed
diff --git a/rich/cells.py b/rich/cells.py
@@ -161,14 +161,19 @@ def _cell_len(text: str, unicode_version: str) -> int:
 def split_graphemes(
     text: str, unicode_version: str = "auto"
 ) -> "tuple[list[CellSpan], int]":
-    """Divide text into spans that define a single grapheme.
+    """Divide text into spans that define a single grapheme, and additionally return the cell length of the whole string.
+
+    The returned spans will cover every index in the string, with no gaps. It is possible for some graphemes to have a cell length of zero.
+    This can occur for nonsense strings like two zero width joiners, or for control codes that don't contribute to the grapheme size.
 
     Args:
         text: String to split.
         unicode_version: Unicode version, `"auto"` to auto detect, `"latest"` for the latest unicode version.
 
     Returns:
-        List of spans.
+        A tuple of a list of *spans* and the cell length of the entire string. Each span is a tuple
+            of three values consisting of (<START>, <END>, <CELL LENGTH>), where START and END are string indices,
+            and CELL LENGTH is the cell length of the single grapheme.
     """
 
     cell_table = load_cell_table(unicode_version)
@@ -181,32 +186,48 @@ def split_graphemes(
     SPECIAL = {"\u200d", "\ufe0f"}
     while index < codepoint_count:
         if (character := text[index]) in SPECIAL:
+            if not spans:
+                # ZWJ or variation selector at the beginning of the string doesn't really make sense.
+                # But handle it, we must.
+                spans.append((index, index := index + 1, 0))
+                continue
             if character == "\u200d":
                 # zero width joiner
-                index += 2
-                if spans:
-                    start, _end, cell_length = spans[-1]
-                    spans[-1] = (start, index, cell_length)
-            elif last_measured_character:
+                # The condition handles the case where a ZWJ is at the end of the string, and has nothing to join
+                index += 2 if index < (codepoint_count - 1) else 1
+                start, _end, cell_length = spans[-1]
+                spans[-1] = (start, index, cell_length)
+            else:
                 # variation selector 16
                 index += 1
-                if spans:
+                if last_measured_character:
                     start, _end, cell_length = spans[-1]
                     if last_measured_character in cell_table.narrow_to_wide:
                         last_measured_character = None
                         cell_length += 1
                         total_width += 1
                     spans[-1] = (start, index, cell_length)
+                else:
+                    # No previous character to change the size of.
+                    # Shouldn't occur in practice.
+                    # But handle it, we must.
+                    start, _end, cell_length = spans[-1]
+                    spans[-1] = (start, index, cell_length)
             continue
 
         if character_width := get_character_cell_size(character, unicode_version):
             last_measured_character = character
             spans.append((index, index := index + 1, character_width))
             total_width += character_width
-        elif spans:
-            # zero width characters are associated with the previous character
-            start, _end, cell_length = spans[-1]
-            spans[-1] = (start, index := index + 1, cell_length)
+        else:
+            # Character has zero width
+            if spans:
+                # zero width characters are associated with the previous character
+                start, _end, cell_length = spans[-1]
+                spans[-1] = (start, index := index + 1, cell_length)
+            else:
+                # A zero width character with no prior spans
+                spans.append((index, index := index + 1, 0))
 
     return (spans, total_width)
 
diff --git a/tests/test_cells.py b/tests/test_cells.py
@@ -134,6 +134,21 @@ def test_chop_cells_mixed_width():
     assert chop_cells(text, 3) == ["あ1", "り2", "34", "が5", "と6", "う7", "8"]
 
 
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        ("", []),
+        ("\x1b", []),
+        ("\x1b\x1b", []),
+        ("\x1b\x1b\x1b", []),
+        ("\x1b\x1b\x1b\x1b", []),
+    ],
+)
+def test_chop_cells_zero_width(text: str, expected: list[str]) -> None:
+    """Test zero width characters being chopped."""
+    assert chop_cells(text, 3) == expected
+
+
 def test_is_single_cell_widths() -> None:
     # Check _is_single_cell_widths reports correctly
     for character in string.printable:
@@ -172,12 +187,35 @@ def test_is_single_cell_widths() -> None:
         ("♻", [(0, 1, 1)], 1),
         ("♻️", [(0, 2, 2)], 2),
         ("♻♻️", [(0, 1, 1), (1, 3, 2)], 3),
+        ("\x1b", [(0, 1, 0)], 0),  # One escape should have zero width
+        ("\x1b\x1b", [(0, 2, 0)], 0),  # Two escapes should have zero width
+        (
+            "\ufe0f",
+            [(0, 1, 0)],
+            0,
+        ),  # Variation selector 16, without anything to change should have zero width
+        (
+            "\ufe0f\ufe0f",
+            [(0, 2, 0)],
+            0,
+        ),  # 2 X variation selector 16, without anything to change should have zero width
+        (
+            "\u200d",
+            [(0, 1, 0)],
+            0,
+        ),  # A zero width joiner with nothing prior should have zero width
+        (
+            "\u200d\u200d",
+            [(0, 2, 0)],
+            0,
+        ),  # Two ZWJs should have zero width
     ],
 )
 def test_split_graphemes(
     text: str, expected_spans: list[CellSpan], expected_cell_length: int
 ):
     spans, cell_length = split_graphemes(text)
+    print(spans)
     assert cell_len(text) == expected_cell_length
     assert spans == expected_spans
     assert cell_length == expected_cell_length