openlawlibrary · tombh · Jul 11, 2025 · Jul 10, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/pygls/workspace/position_codec.py b/pygls/workspace/position_codec.py
@@ -33,6 +33,9 @@ def __init__(
     ):
         self.encoding = encoding
 
+    def __repr__(self):
+        return f"<{self.__class__.__name__}, encoding {self.encoding}>"
+
     @classmethod
     def is_char_beyond_multilingual_plane(cls, char: str) -> bool:
         return ord(char) > 0xFFFF
@@ -46,6 +49,16 @@ def utf16_unit_offset(self, chars: str):
         """
         return sum(self.is_char_beyond_multilingual_plane(ch) for ch in chars)
 
+    def utf8_bytes(self, char: str) -> int:
+        codepoint = ord(char)
+        if codepoint < 0x80:
+            return 1
+        if codepoint < 0x800:
+            return 2
+        if codepoint < 0x10000:
+            return 3
+        return 4
+
     def client_num_units(self, chars: str):
         """
         Calculate the length of `str` in client-supported UTF-[32|16|8] code units.
@@ -58,7 +71,7 @@ def client_num_units(self, chars: str):
             return utf32_units
 
         if self.encoding == types.PositionEncodingKind.Utf8:
-            return utf32_units + (self.utf16_unit_offset(chars) * 2)
+            return sum(self.utf8_bytes(c) for c in chars)
 
         return utf32_units + self.utf16_unit_offset(chars)
 
@@ -120,15 +133,12 @@ def position_from_client_units(
                 break
 
             _current_char = _line[utf32_index]
-            _is_double_width = PositionCodec.is_char_beyond_multilingual_plane(
-                _current_char
-            )
-            if _is_double_width:
-                if self.encoding == types.PositionEncodingKind.Utf32:
-                    _client_index += 1
-                if self.encoding == types.PositionEncodingKind.Utf8:
-                    _client_index += 4
-                _client_index += 2
+            if self.encoding == types.PositionEncodingKind.Utf8:
+                _client_index += self.utf8_bytes(_current_char)
+            elif self.encoding == types.PositionEncodingKind.Utf16:
+                _client_index += (
+                    2 if self.is_char_beyond_multilingual_plane(_current_char) else 1
+                )
             else:
                 _client_index += 1
             utf32_index += 1

diff --git a/tests/test_document.py b/tests/test_document.py
@@ -20,6 +20,7 @@
 
 from lsprotocol import types
 from pygls.workspace import TextDocument, PositionCodec
+import pytest
 from .conftest import DOC, DOC_URI
 
 
@@ -173,6 +174,64 @@ def test_document_source_unicode():
     assert isinstance(document_mem.source, type(document_disk.source))
 
 
+SAMPLE_STRING = (
+    "\u00e4"  # Non-ASCII character (latin small letter a with diaeresis) -- 1 utf-16 code unit, 2 utf-8 code units
+    "\u0061\u0308"  # Same letter but decomposed to "a" and combining diaeresis (NFD) -- 2 utf-16 code units, 3 utf-8 code units
+    "錯誤"  # Characters in BMP but with 3-byte utf-8 encodings -- 2 utf-16 code units, 2x3 utf-8 code units
+    "😋"  # Emoji (outside Basic Multilingual Plane) -- one codepoint, 2 utf-16 code units, 4 utf-8 code units
+)
+
+CODECS = (
+    # use explicit little-endian encodings that don't emit a byte order mark
+    (PositionCodec(encoding=types.PositionEncodingKind.Utf32), "utf-32-le", 4),
+    (PositionCodec(encoding=types.PositionEncodingKind.Utf16), "utf-16-le", 2),
+    (PositionCodec(encoding=types.PositionEncodingKind.Utf8), "utf-8", 1),
+)
+
+
+@pytest.mark.parametrize(
+    ["position_codec", "codec_name", "code_unit_size"],
+    CODECS,
+)
+def test_length_consistency(position_codec, codec_name, code_unit_size):
+    # Test that the string codec and the position codec agree on how long the encoded string is
+    assert (
+        len(SAMPLE_STRING.encode(codec_name))
+        == position_codec.client_num_units(SAMPLE_STRING) * code_unit_size
+    )
+
+    # and that they agree going through codepoint by codepoint as well (to avoid off-by-ones cancelling each other out)
+    for end in range(len(SAMPLE_STRING) + 1):
+        sliced = SAMPLE_STRING[:end]
+        encoded = sliced.encode(codec_name)
+        assert position_codec.client_num_units(sliced) * code_unit_size == len(encoded)
+
+
+@pytest.mark.parametrize(
+    ["position_codec", "codec_name", "code_unit_size"],
+    CODECS,
+)
+def test_encoding_position_consistency(position_codec, codec_name, code_unit_size):
+    encoded = SAMPLE_STRING.encode(codec_name)
+    for column in range(len(SAMPLE_STRING)):
+        utf32_pos = types.Position(line=0, character=column)
+        client_pos = position_codec.position_to_client_units([SAMPLE_STRING], utf32_pos)
+        # The position that we get from position_to_client_units
+        # should be the right number of code units to cut off the
+        # beginning in order to get back the same string. This
+        # assertion ensures both that the codecs agree on the offsets,
+        # and that we don't emit positions in the middle of a
+        # multi-code-unit codepoint (since then the `decode` would fail).
+        byte_pos = client_pos.character * code_unit_size
+        assert encoded[byte_pos:].decode(codec_name) == SAMPLE_STRING[column:]
+
+        # And the conversion should roundtrip.
+        assert (
+            position_codec.position_from_client_units([SAMPLE_STRING], client_pos)
+            == utf32_pos
+        )
+
+
 def test_position_from_utf16():
     codec = PositionCodec(encoding=types.PositionEncodingKind.Utf16)
     assert codec.position_from_client_units(
@@ -233,7 +292,7 @@ def test_position_to_utf8():
 
     assert codec.position_to_client_units(
         ['x="😋"'], types.Position(line=0, character=4)
-    ) == types.Position(line=0, character=6)
+    ) == types.Position(line=0, character=7)
 
 
 def test_range_from_utf16():
@@ -316,7 +375,7 @@ def test_offset_at_position_utf8():
         position_codec=PositionCodec(encoding=types.PositionEncodingKind.Utf8),
     )
     assert doc.offset_at_position(types.Position(line=0, character=8)) == 8
-    assert doc.offset_at_position(types.Position(line=5, character=0)) == 41
+    assert doc.offset_at_position(types.Position(line=5, character=0)) == 42
 
 
 def test_utf16_to_utf32_position_cast():