Skip to content

Commit bb7138c

Browse files
fix: 🐛 fix conversion of client_position into offset_at_position
`offset_at_position` was incorrectly summing code units instead of code points over the lines preceding the given position, leading to incorrect offsets for UTF-8 and UTF-16 clients. `client_num_units` returns code units, but the offset is an index into the Python string and must therefore be counted in code points, as returned by `len()`. For UTF-16, the bug only shows up when the lines preceding the given position contain code points outside the Basic Multilingual Plane. Adapt SAMPLE_STRING to cover this case. Introduce a test that checks the offset calculation for every existing client position in SAMPLE_STRING, using the actually encoded string as the reference. Co-authored-by: Linus Heckemann <git@sphalerite.org>
1 parent bcde840 commit bb7138c

3 files changed

Lines changed: 43 additions & 4 deletions

File tree

pygls/workspace/position_codec.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,10 @@ def range_to_client_units(
204204
lines (sequence):
205205
The content of the document which the range refers to.
206206
range (Range):
207-
The line and character offset in code units.
207+
The line and character offset in code points.
208208
209209
Returns:
210-
The range with `character` offsets being converted to UTF-[32|16|8] code units.
210+
The range with `character` offsets converted to UTF-[32|16|8] code units.
211211
"""
212212
return types.Range(
213213
start=self.position_to_client_units(lines, range.start),

pygls/workspace/text_document.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,14 +167,23 @@ def lines(self) -> Sequence[str]:
167167
return tuple(self.source.splitlines(True))
168168

169169
def offset_at_position(self, client_position: types.Position) -> int:
170-
"""Return the character offset pointed at by the given client_position."""
170+
"""
171+
Convert client_position to an index into self.source.
172+
173+
The index is the number of code points preceding the client_position in self.source.
174+
175+
Example in a code action request handler:
176+
selected_string = document.source[
177+
document.offset_at_position(params.range.start) : document.offset_at_position(params.range.end)
178+
]
179+
"""
171180
lines = self.lines
172181
server_position = self._position_codec.position_from_client_units(
173182
lines, client_position
174183
)
175184
row, col = server_position.line, server_position.character
176185
return col + sum(
177-
self._position_codec.client_num_units(line) for line in lines[:row]
186+
len(line) for line in lines[:row]
178187
)
179188

180189
@property

tests/test_document.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,12 @@ def test_document_source_unicode():
179179
"\u0061\u0308" # Same letter but decomposed to "a" and combining diaeresis (NFD) -- 2 utf-16 code units, 3 utf-8 code units
180180
"錯誤" # Characters in BMP but with 3-byte utf-8 encodings -- 2 utf-16 code units, 2x3 utf-8 code units
181181
"😋" # Emoji (outside Basic Multilingual Plane) -- one code point, 2 utf-16 code units, 4 utf-8 code units
182+
"\n" # To trigger offset calculation bugs in multiline documents
183+
# and all that again
184+
"\u00e4"
185+
"\u0061\u0308"
186+
"錯誤"
187+
"😋"
182188
)
183189

184190
CODECS = (
@@ -344,6 +350,30 @@ def test_range_to_utf16():
344350
)
345351
assert actual == expected
346352

353+
@pytest.mark.parametrize(
354+
["position_codec", "codec_name", "code_unit_size"],
355+
CODECS,
356+
)
357+
def test_offset_at_position(position_codec, codec_name, code_unit_size):
358+
document = TextDocument(
359+
DOC_URI,
360+
SAMPLE_STRING,
361+
position_codec=position_codec,
362+
)
363+
# Test all existing positions in the document
364+
expected_offset = 0
365+
for line_index, line in enumerate(document.lines):
366+
for code_point_index, code_point in enumerate(line):
367+
# The encoded partial line is needed to calculate the number of code units in the respective encoding
368+
# The python encode method is assumed to be correct and therefore used as a reference
369+
partial_line_encoded = line[:code_point_index].encode(codec_name)
370+
client_position = types.Position(line=line_index, character=len(partial_line_encoded) // code_unit_size)
371+
offset = document.offset_at_position(client_position)
372+
assert document.source[offset] == code_point
373+
assert offset == expected_offset
374+
# When iterating over the code points of document.source,
375+
# the correct offset is always one more than in the previous iteration
376+
expected_offset += 1
347377

348378
def test_offset_at_position_utf16():
349379
doc = TextDocument(DOC_URI, DOC)

0 commit comments

Comments
 (0)