|
16 | 16 |
|
17 | 17 | package com.google.cloud.firestore; |
18 | 18 |
|
| 19 | +import static java.lang.Character.isSurrogate; |
| 20 | + |
19 | 21 | import com.google.firestore.v1.MapValue; |
20 | 22 | import com.google.firestore.v1.Value; |
21 | 23 | import com.google.firestore.v1.Value.ValueTypeCase; |
@@ -136,46 +138,46 @@ public int compare(@Nonnull Value left, @Nonnull Value right) { |
136 | 138 |
|
137 | 139 | /** Compare strings in UTF-8 encoded byte order */ |
138 | 140 | public static int compareUtf8Strings(String left, String right) { |
139 | | - int i = 0; |
140 | | - while (i < left.length() && i < right.length()) { |
141 | | - int leftCodePoint = left.codePointAt(i); |
142 | | - int rightCodePoint = right.codePointAt(i); |
143 | | - |
144 | | - if (leftCodePoint != rightCodePoint) { |
145 | | - if (leftCodePoint < 128 && rightCodePoint < 128) { |
146 | | - // ASCII comparison |
147 | | - return Integer.compare(leftCodePoint, rightCodePoint); |
148 | | - } else { |
149 | | - // UTF-8 encode the character at index i for byte comparison. |
150 | | - ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i)); |
151 | | - ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i)); |
152 | | - int comp = compareByteStrings(leftBytes, rightBytes); |
153 | | - if (comp != 0) { |
154 | | - return comp; |
155 | | - } else { |
156 | | - // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are |
157 | | - // identical. This can happen with malformed input (invalid surrogate pairs), where |
158 | | - // Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate |
159 | | - // inputs get converted to "?" by protocol buffer while round tripping, so we almost |
160 | | - // never receive invalid strings from backend. |
161 | | - // Fallback to code point comparison for graceful handling. |
162 | | - return Integer.compare(leftCodePoint, rightCodePoint); |
163 | | - } |
164 | | - } |
| 141 | + // noinspection StringEquality |
| 142 | + if (left == right) { |
| 143 | + return 0; |
| 144 | + } |
| 145 | + |
| 146 | + // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and, |
| 147 | + // if found, use that character to determine the relative ordering of the two strings as a |
| 148 | + // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by |
| 149 | + // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8 |
| 150 | + // and UTF-16 happen to represent Unicode code points. |
| 151 | + // |
| 152 | + // After finding the first pair of differing characters, there are two cases: |
| 153 | + // |
| 154 | + // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or |
| 155 | + // both are surrogates from a surrogate pair (that collectively represent code points greater |
| 156 | + // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the |
| 157 | + // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is |
| 158 | + // sufficient. |
| 159 | + // |
| 160 | + // Case 2: One character is a surrogate and the other is not. In this case the surrogate- |
| 161 | + // containing string is always ordered after the non-surrogate. This is because surrogates are |
| 162 | + // used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations |
| 163 | + // and are lexicographically greater than the 1, 2, or 3-byte representations of code points |
| 164 | + // less than or equal to 0xFFFF. |
| 165 | + final int length = Math.min(left.length(), right.length()); |
| 166 | + for (int i = 0; i < length; i++) { |
| 167 | + final char leftChar = left.charAt(i); |
| 168 | + final char rightChar = right.charAt(i); |
| 169 | + if (leftChar != rightChar) { |
| 170 | + return (isSurrogate(leftChar) == isSurrogate(rightChar)) |
| 171 | + ? Character.compare(leftChar, rightChar) |
| 172 | + : isSurrogate(leftChar) ? 1 : -1; |
165 | 173 | } |
166 | | - // Increment by 2 for surrogate pairs, 1 otherwise. |
167 | | - i += Character.charCount(leftCodePoint); |
168 | 174 | } |
169 | 175 |
|
170 | | - // Compare lengths if all characters are equal |
| 176 | + // Use the lengths of the strings to determine the overall comparison result since either the |
| 177 | + // strings were equal or one is a prefix of the other. |
171 | 178 | return Integer.compare(left.length(), right.length()); |
172 | 179 | } |
173 | 180 |
|
174 | | - private static String getUtf8SafeBytes(String str, int index) { |
175 | | - int firstCodePoint = str.codePointAt(index); |
176 | | - return str.substring(index, index + Character.charCount(firstCodePoint)); |
177 | | - } |
178 | | - |
179 | 181 | private int compareBlobs(Value left, Value right) { |
180 | 182 | ByteString leftBytes = left.getBytesValue(); |
181 | 183 | ByteString rightBytes = right.getBytesValue(); |
|
0 commit comments