@@ -161,14 +161,19 @@ def _cell_len(text: str, unicode_version: str) -> int:
161161def split_graphemes (
162162 text : str , unicode_version : str = "auto"
163163) -> "tuple[list[CellSpan], int]" :
164- """Divide text into spans that define a single grapheme.
164+ """Divide text into spans that define a single grapheme, and additionally return the cell length of the whole string.
165+
166+ The returned spans will cover every index in the string, with no gaps. It is possible for some graphemes to have a cell length of zero.
167+ This can occur for nonsense strings like two zero width joiners, or for control codes that don't contribute to the grapheme size.
165168
166169 Args:
167170 text: String to split.
168171 unicode_version: Unicode version, `"auto"` to auto detect, `"latest"` for the latest unicode version.
169172
170173 Returns:
171- List of spans.
174+ A tuple of a list of *spans* and the cell length of the entire string. Each span is a tuple
175+ of three values (<START>, <END>, <CELL LENGTH>), where START and END are string indices (END is exclusive),
176+ and CELL LENGTH is the cell length of the single grapheme.
172177 """
173178
174179 cell_table = load_cell_table (unicode_version )
@@ -181,32 +186,48 @@ def split_graphemes(
181186 SPECIAL = {"\u200d " , "\ufe0f " }
182187 while index < codepoint_count :
183188 if (character := text [index ]) in SPECIAL :
189+ if not spans :
190+ # ZWJ or variation selector at the beginning of the string doesn't really make sense.
191+ # But handle it, we must.
192+ spans .append ((index , index := index + 1 , 0 ))
193+ continue
184194 if character == "\u200d " :
185195 # zero width joiner
186- index += 2
187- if spans :
188- start , _end , cell_length = spans [- 1 ]
189- spans [- 1 ] = (start , index , cell_length )
190- elif last_measured_character :
196+ # The condition handles the case where a ZWJ is at the end of the string, and has nothing to join
197+ index += 2 if index < ( codepoint_count - 1 ) else 1
198+ start , _end , cell_length = spans [- 1 ]
199+ spans [- 1 ] = (start , index , cell_length )
200+ else :
191201 # variation selector 16
192202 index += 1
193- if spans :
203+ if last_measured_character :
194204 start , _end , cell_length = spans [- 1 ]
195205 if last_measured_character in cell_table .narrow_to_wide :
196206 last_measured_character = None
197207 cell_length += 1
198208 total_width += 1
199209 spans [- 1 ] = (start , index , cell_length )
210+ else :
211+ # No previous character to change the size of.
212+ # Shouldn't occur in practice.
213+ # But handle it, we must.
214+ start , _end , cell_length = spans [- 1 ]
215+ spans [- 1 ] = (start , index , cell_length )
200216 continue
201217
202218 if character_width := get_character_cell_size (character , unicode_version ):
203219 last_measured_character = character
204220 spans .append ((index , index := index + 1 , character_width ))
205221 total_width += character_width
206- elif spans :
207- # zero width characters are associated with the previous character
208- start , _end , cell_length = spans [- 1 ]
209- spans [- 1 ] = (start , index := index + 1 , cell_length )
222+ else :
223+ # Character has zero width
224+ if spans :
225+ # zero width characters are associated with the previous character
226+ start , _end , cell_length = spans [- 1 ]
227+ spans [- 1 ] = (start , index := index + 1 , cell_length )
228+ else :
229+ # A zero width character with no prior spans
230+ spans .append ((index , index := index + 1 , 0 ))
210231
211232 return (spans , total_width )
212233
0 commit comments