fix: Improve performance of the UTF-8 string comparison logic (#2380)

dconeybe · web-flow · commit bc6a03e2b44e · 2025-07-07T16:31:16.000-04:00
diff --git a/dev/src/order.ts b/dev/src/order.ts
@@ -254,56 +254,49 @@ function compareVectors(left: ApiMapValue, right: ApiMapValue): number {
  * @internal
  */
 export function compareUtf8Strings(left: string, right: string): number {
-  let i = 0;
-  while (i < left.length && i < right.length) {
-    const leftCodePoint = left.codePointAt(i)!;
-    const rightCodePoint = right.codePointAt(i)!;
-
-    if (leftCodePoint !== rightCodePoint) {
-      if (leftCodePoint < 128 && rightCodePoint < 128) {
-        // ASCII comparison
-        return primitiveComparator(leftCodePoint, rightCodePoint);
-      } else {
-        // Lazy instantiate TextEncoder
-        const encoder = new TextEncoder();
-
-        // UTF-8 encode the character at index i for byte comparison.
-        const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i));
-        const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i));
-        const comp = compareBlobs(
-          Buffer.from(leftBytes),
-          Buffer.from(rightBytes)
-        );
-        if (comp !== 0) {
-          return comp;
-        } else {
-          // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte
-          // representations are identical. This can happen with malformed input
-          // (invalid surrogate pairs). The backend also actively prevents invalid
-          // surrogates as INVALID_ARGUMENT errors, so we almost never receive
-          // invalid strings from backend.
-          // Fallback to code point comparison for graceful handling.
-          return primitiveComparator(leftCodePoint, rightCodePoint);
-        }
-      }
+  // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
+  // if found, use that character to determine the relative ordering of the two strings as a
+  // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
+  // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
+  // and UTF-16 happen to represent Unicode code points.
+  //
+  // After finding the first pair of differing characters, there are two cases:
+  //
+  // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
+  // both are surrogates from a surrogate pair (that collectively represent code points greater
+  // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
+  // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
+  // sufficient.
+  //
+  // Case 2: One character is a surrogate and the other is not. In this case the surrogate-
+  // containing string is always ordered after the non-surrogate. This is because surrogates are
+  // used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
+  // and are lexicographically greater than the 1, 2, or 3-byte representations of code points
+  // less than or equal to 0xFFFF.
+  const length = Math.min(left.length, right.length);
+  for (let i = 0; i < length; i++) {
+    const leftChar = left.charAt(i);
+    const rightChar = right.charAt(i);
+    if (leftChar !== rightChar) {
+      return isSurrogate(leftChar) === isSurrogate(rightChar)
+        ? primitiveComparator(leftChar, rightChar)
+        : isSurrogate(leftChar)
+          ? 1
+          : -1;
     }
-    // Increment by 2 for surrogate pairs, 1 otherwise
-    i += leftCodePoint > 0xffff ? 2 : 1;
   }
 
-  // Compare lengths if all characters are equal
+  // Use the lengths of the strings to determine the overall comparison result since either the
+  // strings were equal or one is a prefix of the other.
   return primitiveComparator(left.length, right.length);
 }
 
-function getUtf8SafeSubstring(str: string, index: number): string {
-  const firstCodePoint = str.codePointAt(index)!;
-  if (firstCodePoint > 0xffff) {
-    // It's a surrogate pair, return the whole pair
-    return str.substring(index, index + 2);
-  } else {
-    // It's a single code point, return it
-    return str.substring(index, index + 1);
-  }
+const MIN_SURROGATE = 0xd800;
+const MAX_SURROGATE = 0xdfff;
+
+function isSurrogate(s: string): boolean {
+  const c = s.charCodeAt(0);
+  return c >= MIN_SURROGATE && c <= MAX_SURROGATE;
 }
 
 /*!