Skip to content

Commit 84c4d81

Browse files
authored
fix(firestore): Further improved performance of UTF-8 string comparison logic (#7098)
1 parent aad3da8 commit 84c4d81

File tree

2 files changed

+39
-34
lines changed

2 files changed

+39
-34
lines changed

firebase-firestore/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
# Unreleased
2+
* [fixed] Further improved performance of UTF-8 string ordering logic,
3+
which had degraded in v25.1.2 and received some improvements in v25.1.3.
4+
[#7053](//github.com/firebase/firebase-android-sdk/issues/7053)
25

36

47
# 25.1.4

firebase-firestore/src/main/java/com/google/firebase/firestore/util/Util.java

Lines changed: 36 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
package com.google.firebase.firestore.util;
1616

17+
import static java.lang.Character.isSurrogate;
18+
1719
import android.annotation.SuppressLint;
1820
import android.os.Handler;
1921
import android.os.Looper;
@@ -87,46 +89,46 @@ public static int compareIntegers(int i1, int i2) {
8789

8890
/** Compare strings in UTF-8 encoded byte order */
8991
public static int compareUtf8Strings(String left, String right) {
90-
int i = 0;
91-
while (i < left.length() && i < right.length()) {
92-
int leftCodePoint = left.codePointAt(i);
93-
int rightCodePoint = right.codePointAt(i);
94-
95-
if (leftCodePoint != rightCodePoint) {
96-
if (leftCodePoint < 128 && rightCodePoint < 128) {
97-
// ASCII comparison
98-
return Integer.compare(leftCodePoint, rightCodePoint);
99-
} else {
100-
// substring and do UTF-8 encoded byte comparison
101-
ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i));
102-
ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i));
103-
int comp = compareByteStrings(leftBytes, rightBytes);
104-
if (comp != 0) {
105-
return comp;
106-
} else {
107-
// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are
108-
// identical. This can happen with malformed input (invalid surrogate pairs), where
109-
// Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate
110-
// inputs get converted to "?" by protocol buffer while round tripping, so we almost
111-
// never receive invalid strings from backend.
112-
// Fallback to code point comparison for graceful handling.
113-
return Integer.compare(leftCodePoint, rightCodePoint);
114-
}
115-
}
92+
// noinspection StringEquality
93+
if (left == right) {
94+
return 0;
95+
}
96+
97+
// Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
98+
// if found, use that character to determine the relative ordering of the two strings as a
99+
// whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
100+
// comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
101+
// and UTF-16 happen to represent Unicode code points.
102+
//
103+
// After finding the first pair of differing characters, there are two cases:
104+
//
105+
// Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
106+
// both are surrogates from a surrogate pair (that collectively represent code points greater
107+
// than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
108+
// lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
109+
// sufficient.
110+
//
111+
// Case 2: One character is a surrogate and the other is not. In this case the surrogate-
112+
// containing string is always ordered after the other string. This is because surrogates are
113+
// used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
114+
// and are lexicographically greater than the 1, 2, or 3-byte representations of code points
115+
// less than or equal to 0xFFFF.
116+
final int length = Math.min(left.length(), right.length());
117+
for (int i = 0; i < length; i++) {
118+
final char leftChar = left.charAt(i);
119+
final char rightChar = right.charAt(i);
120+
if (leftChar != rightChar) {
121+
return (isSurrogate(leftChar) == isSurrogate(rightChar))
122+
? Util.compareIntegers(leftChar, rightChar)
123+
: isSurrogate(leftChar) ? 1 : -1;
116124
}
117-
// Increment by 2 for surrogate pairs, 1 otherwise.
118-
i += Character.charCount(leftCodePoint);
119125
}
120126

121-
// Compare lengths if all characters are equal
127+
// Use the lengths of the strings to determine the overall comparison result since either the
128+
// strings were equal or one is a prefix of the other.
122129
return Integer.compare(left.length(), right.length());
123130
}
124131

125-
private static String getUtf8SafeBytes(String str, int index) {
126-
int firstCodePoint = str.codePointAt(index);
127-
return str.substring(index, index + Character.charCount(firstCodePoint));
128-
}
129-
130132
/**
131133
* Utility function to compare longs. Note that we can't use Long.compare because it's only
132134
* available after Android 19.

0 commit comments

Comments
 (0)