Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 20 additions & 10 deletions pygls/workspace/position_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ def __init__(
):
self.encoding = encoding

def __repr__(self):
    """Return a short debugging representation: class name plus encoding."""
    class_name = self.__class__.__name__
    return f"<{class_name}, encoding {self.encoding}>"

@classmethod
def is_char_beyond_multilingual_plane(cls, char: str) -> bool:
    """Report whether `char` lies outside the Basic Multilingual Plane.

    :param char: A single-character string.
    :returns: ``True`` when the character's code point is above U+FFFF.
    """
    last_bmp_codepoint = 0xFFFF
    return last_bmp_codepoint < ord(char)
Expand All @@ -46,6 +49,16 @@ def utf16_unit_offset(self, chars: str):
"""
return sum(self.is_char_beyond_multilingual_plane(ch) for ch in chars)

def utf8_bytes(self, char: str) -> int:
    """Return how many UTF-8 code units (bytes) are needed to encode `char`.

    :param char: A single-character string.
    :returns: 1, 2, 3 or 4 depending on the magnitude of the code point.
    """
    codepoint = ord(char)
    # Exclusive upper bounds of the code point ranges that fit in 1, 2 and
    # 3 bytes respectively; anything at or above the last bound needs 4.
    for width, upper_bound in enumerate((0x80, 0x800, 0x10000), start=1):
        if codepoint < upper_bound:
            return width
    return 4

def client_num_units(self, chars: str):
"""
Calculate the length of `str` in client-supported UTF-[32|16|8] code units.
Expand All @@ -58,7 +71,7 @@ def client_num_units(self, chars: str):
return utf32_units

if self.encoding == types.PositionEncodingKind.Utf8:
return utf32_units + (self.utf16_unit_offset(chars) * 2)
return sum(self.utf8_bytes(c) for c in chars)

return utf32_units + self.utf16_unit_offset(chars)

Expand Down Expand Up @@ -120,15 +133,12 @@ def position_from_client_units(
break

_current_char = _line[utf32_index]
_is_double_width = PositionCodec.is_char_beyond_multilingual_plane(
_current_char
)
if _is_double_width:
if self.encoding == types.PositionEncodingKind.Utf32:
_client_index += 1
if self.encoding == types.PositionEncodingKind.Utf8:
_client_index += 4
_client_index += 2
if self.encoding == types.PositionEncodingKind.Utf8:
_client_index += self.utf8_bytes(_current_char)
elif self.encoding == types.PositionEncodingKind.Utf16:
_client_index += (
2 if self.is_char_beyond_multilingual_plane(_current_char) else 1
)
else:
_client_index += 1
utf32_index += 1
Expand Down
63 changes: 61 additions & 2 deletions tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from lsprotocol import types
from pygls.workspace import TextDocument, PositionCodec
import pytest
from .conftest import DOC, DOC_URI


Expand Down Expand Up @@ -173,6 +174,64 @@ def test_document_source_unicode():
assert isinstance(document_mem.source, type(document_disk.source))


# Sample text mixing characters whose encoded length differs between UTF-8,
# UTF-16 and UTF-32, built by implicit concatenation of the literals below.
SAMPLE_STRING = (
    "\u00e4"  # Non-ASCII character (latin small letter a with diaeresis) -- 1 utf-16 code unit, 2 utf-8 code units
    "\u0061\u0308"  # Same letter but decomposed to "a" and combining diaeresis (NFD) -- 2 utf-16 code units, 3 utf-8 code units
    "錯誤"  # Characters in BMP but with 3-byte utf-8 encodings -- 2 utf-16 code units, 2x3 utf-8 code units
    "😋"  # Emoji (outside Basic Multilingual Plane) -- one codepoint, 2 utf-16 code units, 4 utf-8 code units
)

# Each entry is (position codec, Python codec name, bytes per code unit).
# The explicit little-endian codec names are used because they do not emit
# a byte order mark.
CODECS = tuple(
    (PositionCodec(encoding=encoding_kind), codec_name, code_unit_size)
    for encoding_kind, codec_name, code_unit_size in (
        (types.PositionEncodingKind.Utf32, "utf-32-le", 4),
        (types.PositionEncodingKind.Utf16, "utf-16-le", 2),
        (types.PositionEncodingKind.Utf8, "utf-8", 1),
    )
)


@pytest.mark.parametrize(
    ["position_codec", "codec_name", "code_unit_size"],
    CODECS,
)
def test_length_consistency(position_codec, codec_name, code_unit_size):
    """Check the position codec and the string codec agree on encoded lengths."""

    def length_in_bytes(text):
        # Code units reported by the position codec, scaled to bytes.
        return position_codec.client_num_units(text) * code_unit_size

    # The two codecs must agree on the length of the whole sample...
    whole_encoded = SAMPLE_STRING.encode(codec_name)
    assert len(whole_encoded) == length_in_bytes(SAMPLE_STRING)

    # ...and on every prefix of it, codepoint by codepoint, so that
    # off-by-one errors cannot cancel each other out.
    for prefix_length in range(len(SAMPLE_STRING) + 1):
        prefix = SAMPLE_STRING[:prefix_length]
        prefix_encoded = prefix.encode(codec_name)
        assert length_in_bytes(prefix) == len(prefix_encoded)


@pytest.mark.parametrize(
    ["position_codec", "codec_name", "code_unit_size"],
    CODECS,
)
def test_encoding_position_consistency(position_codec, codec_name, code_unit_size):
    """Check client positions match encoded offsets and convert back losslessly."""
    sample_bytes = SAMPLE_STRING.encode(codec_name)
    for column, _ in enumerate(SAMPLE_STRING):
        utf32_pos = types.Position(line=0, character=column)
        client_pos = position_codec.position_to_client_units([SAMPLE_STRING], utf32_pos)

        # Dropping `client_pos.character` code units from the encoded sample
        # must leave exactly the text from `column` onwards. This checks that
        # the codecs agree on offsets, and that no position falls inside a
        # multi-code-unit codepoint (the `decode` would fail if one did).
        start = client_pos.character * code_unit_size
        remainder = sample_bytes[start:]
        assert remainder.decode(codec_name) == SAMPLE_STRING[column:]

        # Converting back must give the original position.
        restored = position_codec.position_from_client_units(
            [SAMPLE_STRING], client_pos
        )
        assert restored == utf32_pos


def test_position_from_utf16():
codec = PositionCodec(encoding=types.PositionEncodingKind.Utf16)
assert codec.position_from_client_units(
Expand Down Expand Up @@ -233,7 +292,7 @@ def test_position_to_utf8():

assert codec.position_to_client_units(
['x="😋"'], types.Position(line=0, character=4)
) == types.Position(line=0, character=6)
) == types.Position(line=0, character=7)


def test_range_from_utf16():
Expand Down Expand Up @@ -316,7 +375,7 @@ def test_offset_at_position_utf8():
position_codec=PositionCodec(encoding=types.PositionEncodingKind.Utf8),
)
assert doc.offset_at_position(types.Position(line=0, character=8)) == 8
assert doc.offset_at_position(types.Position(line=5, character=0)) == 41
assert doc.offset_at_position(types.Position(line=5, character=0)) == 42


def test_utf16_to_utf32_position_cast():
Expand Down
Loading