fix: 🐛 fix conversion of client_position to an offset in offset_at_position

wolfskaempf · lheckemann · wolfskaempf · commit bb7138cb1319 · 2025-07-16T13:27:32.000+02:00
`offset_at_position` was incorrectly using code units instead of code
points, leading to incorrect offset calculations for UTF-[8|16].

`client_num_units` returns code units, but the correct calculation needs
to use code points, as returned by `len()`.

For UTF-16, this bug only manifests when code points outside the Basic
Multilingual Plane are present in lines preceding the given position.

Adapt SAMPLE_STRING to cover this case.

Introduce a test that checks the offset calculation for every existing
client position in SAMPLE_STRING, using the actually encoded string as
the reference.

Co-authored-by: Linus Heckemann <git@sphalerite.org>
diff --git a/pygls/workspace/position_codec.py b/pygls/workspace/position_codec.py
@@ -204,10 +204,10 @@ def range_to_client_units(
             lines (sequence):
                 The content of the document which the range refers to.
             range (Range):
-                The line and character offset in  code units.
+                The line and character offset in code points.
 
         Returns:
-            The range with `character` offsets being converted to UTF-[32|16|8] code units.
+            The range with `character` offsets converted to UTF-[32|16|8] code units.
         """
         return types.Range(
             start=self.position_to_client_units(lines, range.start),
diff --git a/pygls/workspace/text_document.py b/pygls/workspace/text_document.py
@@ -167,14 +167,23 @@ def lines(self) -> Sequence[str]:
         return tuple(self.source.splitlines(True))
 
     def offset_at_position(self, client_position: types.Position) -> int:
-        """Return the character offset pointed at by the given client_position."""
+        """
+        Convert client_position to an index into self.source.
+
+        The index is the number of code points preceding the client_position in self.source.
+
+        Example in a code action request handler:
+            selected_string = document.source[
+                document.offset_at_position(params.range.start) : document.offset_at_position(params.range.end)
+            ]
+        """
         lines = self.lines
         server_position = self._position_codec.position_from_client_units(
             lines, client_position
         )
         row, col = server_position.line, server_position.character
         return col + sum(
-            self._position_codec.client_num_units(line) for line in lines[:row]
+            len(line) for line in lines[:row]
         )
 
     @property
diff --git a/tests/test_document.py b/tests/test_document.py
@@ -179,6 +179,12 @@ def test_document_source_unicode():
     "\u0061\u0308"  # Same letter but decomposed to "a" and combining diaeresis (NFD) -- 2 utf-16 code units, 3 utf-8 code units
     "錯誤"  # Characters in BMP but with 3-byte utf-8 encodings -- 2 utf-16 code units, 2x3 utf-8 code units
     "😋"  # Emoji (outside Basic Multilingual Plane) -- one codepoint, 2 utf-16 codepoints, 4 utf-8 codepoints
+    "\n"  # To trigger offset calculation bugs in multiline documents
+    # and all that again
+    "\u00e4"  
+    "\u0061\u0308"
+    "錯誤"
+    "😋"
 )
 
 CODECS = (
@@ -344,6 +350,30 @@ def test_range_to_utf16():
     )
     assert actual == expected
 
+@pytest.mark.parametrize(
+    ["position_codec", "codec_name", "code_unit_size"],
+    CODECS,
+)
+def test_offset_at_position(position_codec, codec_name, code_unit_size):
+    document = TextDocument(
+        DOC_URI,
+        SAMPLE_STRING,
+        position_codec=position_codec,
+    )
+    # Test all existing positions in the document
+    expected_offset = 0
+    for line_index, line in enumerate(document.lines):
+        for code_point_index, code_point in enumerate(line):
+            # The encoded partial line is needed to calculate the number of code units in the respective encoding
+            # The python encode method is assumed to be correct and therefore used as a reference
+            partial_line_encoded = line[:code_point_index].encode(codec_name)
+            client_position = types.Position(line=line_index, character=len(partial_line_encoded) // code_unit_size)
+            offset = document.offset_at_position(client_position)
+            assert document.source[offset] == code_point
+            assert offset == expected_offset
+            # When iterating over the code points of document.source,
+            # the correct offset is always one more than in the previous iteration
+            expected_offset += 1
 
 def test_offset_at_position_utf16():
     doc = TextDocument(DOC_URI, DOC)