|
14 | 14 |
|
15 | 15 | package com.google.firebase.firestore.util;
|
16 | 16 |
|
| 17 | +import static java.lang.Character.isSurrogate; |
| 18 | + |
17 | 19 | import android.annotation.SuppressLint;
|
18 | 20 | import android.os.Handler;
|
19 | 21 | import android.os.Looper;
|
@@ -87,46 +89,46 @@ public static int compareIntegers(int i1, int i2) {
|
87 | 89 |
|
88 | 90 | /** Compare strings in UTF-8 encoded byte order */
|
89 | 91 | public static int compareUtf8Strings(String left, String right) {
|
90 |
| - int i = 0; |
91 |
| - while (i < left.length() && i < right.length()) { |
92 |
| - int leftCodePoint = left.codePointAt(i); |
93 |
| - int rightCodePoint = right.codePointAt(i); |
94 |
| - |
95 |
| - if (leftCodePoint != rightCodePoint) { |
96 |
| - if (leftCodePoint < 128 && rightCodePoint < 128) { |
97 |
| - // ASCII comparison |
98 |
| - return Integer.compare(leftCodePoint, rightCodePoint); |
99 |
| - } else { |
100 |
| - // substring and do UTF-8 encoded byte comparison |
101 |
| - ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i)); |
102 |
| - ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i)); |
103 |
| - int comp = compareByteStrings(leftBytes, rightBytes); |
104 |
| - if (comp != 0) { |
105 |
| - return comp; |
106 |
| - } else { |
107 |
| - // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are |
108 |
| - // identical. This can happen with malformed input (invalid surrogate pairs), where |
109 |
| - // Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate |
110 |
| - // inputs get converted to "?" by protocol buffer while round tripping, so we almost |
111 |
| - // never receive invalid strings from backend. |
112 |
| - // Fallback to code point comparison for graceful handling. |
113 |
| - return Integer.compare(leftCodePoint, rightCodePoint); |
114 |
| - } |
115 |
| - } |
| 92 | + // noinspection StringEquality |
| 93 | + if (left == right) { |
| 94 | + return 0; |
| 95 | + } |
| 96 | + |
| 97 | + // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and, |
| 98 | + // if found, use that character to determine the relative ordering of the two strings as a |
| 99 | + // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by |
| 100 | + // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8 |
| 101 | + // and UTF-16 happen to represent Unicode code points. |
| 102 | + // |
| 103 | + // After finding the first pair of differing characters, there are two cases: |
| 104 | + // |
| 105 | + // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or |
| 106 | + // both are surrogates from a surrogate pair (that collectively represent code points greater |
| 107 | + // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the |
| 108 | + // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is |
| 109 | + // sufficient. |
| 110 | + // |
| 111 | + // Case 2: One character is a surrogate and the other is not. In this case the string |
| 112 | + // containing the surrogate is always ordered after the string containing the non-surrogate. |
| 113 | + // This is because surrogates are used to represent code points greater than 0xFFFF, which have |
| 114 | + // 4-byte UTF-8 representations that are lexicographically greater than the 1, 2, or 3-byte |
| 115 | + // representations of code points less than or equal to 0xFFFF. |
| 116 | + final int length = Math.min(left.length(), right.length()); |
| 117 | + for (int i = 0; i < length; i++) { |
| 118 | + final char leftChar = left.charAt(i); |
| 119 | + final char rightChar = right.charAt(i); |
| 120 | + if (leftChar != rightChar) { |
| 121 | + return (isSurrogate(leftChar) == isSurrogate(rightChar)) |
| 122 | + ? Util.compareIntegers(leftChar, rightChar) |
| 123 | + : isSurrogate(leftChar) ? 1 : -1; |
116 | 124 | }
|
117 |
| - // Increment by 2 for surrogate pairs, 1 otherwise. |
118 |
| - i += Character.charCount(leftCodePoint); |
119 | 125 | }
|
120 | 126 |
|
121 |
| - // Compare lengths if all characters are equal |
| 127 | + // Use the lengths of the strings to determine the overall comparison result since either the |
| 128 | + // strings were equal or one is a prefix of the other. |
122 | 129 | return Integer.compare(left.length(), right.length());
|
123 | 130 | }
|
124 | 131 |
|
125 |
| - private static String getUtf8SafeBytes(String str, int index) { |
126 |
| - int firstCodePoint = str.codePointAt(index); |
127 |
| - return str.substring(index, index + Character.charCount(firstCodePoint)); |
128 |
| - } |
129 |
| - |
130 | 132 | /**
|
131 | 133 | * Utility function to compare longs. Note that we can't use Long.compare because it's only
|
132 | 134 | * available after Android 19.
|
|
0 commit comments