@@ -254,56 +254,49 @@ function compareVectors(left: ApiMapValue, right: ApiMapValue): number {
254254 * @internal
255255 */
256256export function compareUtf8Strings ( left : string , right : string ) : number {
257- let i = 0 ;
258- while ( i < left . length && i < right . length ) {
259- const leftCodePoint = left . codePointAt ( i ) ! ;
260- const rightCodePoint = right . codePointAt ( i ) ! ;
261-
262- if ( leftCodePoint !== rightCodePoint ) {
263- if ( leftCodePoint < 128 && rightCodePoint < 128 ) {
264- // ASCII comparison
265- return primitiveComparator ( leftCodePoint , rightCodePoint ) ;
266- } else {
267- // Lazy instantiate TextEncoder
268- const encoder = new TextEncoder ( ) ;
269-
270- // UTF-8 encode the character at index i for byte comparison.
271- const leftBytes = encoder . encode ( getUtf8SafeSubstring ( left , i ) ) ;
272- const rightBytes = encoder . encode ( getUtf8SafeSubstring ( right , i ) ) ;
273- const comp = compareBlobs (
274- Buffer . from ( leftBytes ) ,
275- Buffer . from ( rightBytes )
276- ) ;
277- if ( comp !== 0 ) {
278- return comp ;
279- } else {
280- // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte
281- // representations are identical. This can happen with malformed input
282- // (invalid surrogate pairs). The backend also actively prevents invalid
283- // surrogates as INVALID_ARGUMENT errors, so we almost never receive
284- // invalid strings from backend.
285- // Fallback to code point comparison for graceful handling.
286- return primitiveComparator ( leftCodePoint , rightCodePoint ) ;
287- }
288- }
257+ // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
258+ // if found, use that character to determine the relative ordering of the two strings as a
259+ // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
260+ // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
261+ // and UTF-16 happen to represent Unicode code points.
262+ //
263+ // After finding the first pair of differing characters, there are two cases:
264+ //
265+ // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
266+ // both are surrogates from a surrogate pair (that collectively represent code points greater
267+ // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
268+ // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
269+ // sufficient.
270+ //
271+ // Case 2: One character is a surrogate and the other is not. In this case the string containing
272+ // the surrogate is always ordered after the other string, even when the non-surrogate code unit
273+ // is numerically larger (0xE000-0xFFFF). This is because surrogates represent code points greater
274+ // than 0xFFFF, which have 4-byte UTF-8 representations that are lexicographically greater than
275+ // the 1, 2, or 3-byte representations of code points less than or equal to 0xFFFF.
276+ const length = Math . min ( left . length , right . length ) ;
277+ for ( let i = 0 ; i < length ; i ++ ) {
278+ const leftChar = left . charAt ( i ) ;
279+ const rightChar = right . charAt ( i ) ;
280+ if ( leftChar !== rightChar ) {
281+ return isSurrogate ( leftChar ) === isSurrogate ( rightChar )
282+ ? primitiveComparator ( leftChar , rightChar )
283+ : isSurrogate ( leftChar )
284+ ? 1
285+ : - 1 ;
289286 }
290- // Increment by 2 for surrogate pairs, 1 otherwise
291- i += leftCodePoint > 0xffff ? 2 : 1 ;
292287 }
293288
294- // Compare lengths if all characters are equal
289+ // Use the lengths of the strings to determine the overall comparison result since either the
290+ // strings were equal or one is a prefix of the other.
295291 return primitiveComparator ( left . length , right . length ) ;
296292}
297293
298- function getUtf8SafeSubstring ( str : string , index : number ) : string {
299- const firstCodePoint = str . codePointAt ( index ) ! ;
300- if ( firstCodePoint > 0xffff ) {
301- // It's a surrogate pair, return the whole pair
302- return str . substring ( index , index + 2 ) ;
303- } else {
304- // It's a single code point, return it
305- return str . substring ( index , index + 1 ) ;
306- }
294+ const MIN_SURROGATE = 0xd800 ;
295+ const MAX_SURROGATE = 0xdfff ;
296+
297+ function isSurrogate ( s : string ) : boolean {
298+ const c = s . charCodeAt ( 0 ) ;
299+ return c >= MIN_SURROGATE && c <= MAX_SURROGATE ;
307300}
308301
309302/*!
0 commit comments