Skip to content

Commit 881ac26

Browse files
felixge authored and tjfontaine committed
deps/v8: Apply REPLACE_INVALID_UTF8 patch
- https://codereview.chromium.org/121173009/ - https://code.google.com/p/v8/source/detail?r=18683 Note: The v8 test case did not apply cleanly, so it is missing from this patch. I'm assuming this is not a problem as long as the v8 test suite is not part of the node build / test system. If it does turn out to be part of it, I'll fix the test case. Otherwise the test case will be integrated once v8 is upgraded.
1 parent 80eff96 commit 881ac26

File tree

4 files changed

+55
-17
lines changed

4 files changed

+55
-17
lines changed

deps/v8/include/v8.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1076,7 +1076,11 @@ class String : public Primitive {
10761076
NO_OPTIONS = 0,
10771077
HINT_MANY_WRITES_EXPECTED = 1,
10781078
NO_NULL_TERMINATION = 2,
1079-
PRESERVE_ASCII_NULL = 4
1079+
PRESERVE_ASCII_NULL = 4,
1080+
// Used by WriteUtf8 to replace orphan surrogate code units with the
1081+
// unicode replacement character. Needs to be set to guarantee valid UTF-8
1082+
// output.
1083+
REPLACE_INVALID_UTF8 = 8
10801084
};
10811085

10821086
// 16-bit character codes.

deps/v8/src/api.cc

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3759,7 +3759,8 @@ static int RecursivelySerializeToUtf8(i::String* string,
37593759
int end,
37603760
int recursion_budget,
37613761
int32_t previous_character,
3762-
int32_t* last_character) {
3762+
int32_t* last_character,
3763+
bool replace_invalid_utf8) {
37633764
int utf8_bytes = 0;
37643765
while (true) {
37653766
if (string->IsAsciiRepresentation()) {
@@ -3775,7 +3776,10 @@ static int RecursivelySerializeToUtf8(i::String* string,
37753776
for (int i = start; i < end; i++) {
37763777
uint16_t character = data[i];
37773778
current +=
3778-
unibrow::Utf8::Encode(current, character, previous_character);
3779+
unibrow::Utf8::Encode(current,
3780+
character,
3781+
previous_character,
3782+
replace_invalid_utf8);
37793783
previous_character = character;
37803784
}
37813785
*last_character = previous_character;
@@ -3788,7 +3792,10 @@ static int RecursivelySerializeToUtf8(i::String* string,
37883792
for (int i = start; i < end; i++) {
37893793
uint16_t character = data[i];
37903794
current +=
3791-
unibrow::Utf8::Encode(current, character, previous_character);
3795+
unibrow::Utf8::Encode(current,
3796+
character,
3797+
previous_character,
3798+
replace_invalid_utf8);
37923799
previous_character = character;
37933800
}
37943801
*last_character = previous_character;
@@ -3824,7 +3831,8 @@ static int RecursivelySerializeToUtf8(i::String* string,
38243831
boundary,
38253832
recursion_budget - 1,
38263833
previous_character,
3827-
&previous_character);
3834+
&previous_character,
3835+
replace_invalid_utf8);
38283836
if (extra_utf8_bytes < 0) return extra_utf8_bytes;
38293837
buffer += extra_utf8_bytes;
38303838
utf8_bytes += extra_utf8_bytes;
@@ -3879,7 +3887,10 @@ int String::WriteUtf8(char* buffer,
38793887
return len;
38803888
}
38813889

3882-
if (capacity == -1 || capacity / 3 >= string_length) {
3890+
bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
3891+
int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
3892+
3893+
if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
38833894
int32_t previous = unibrow::Utf16::kNoPreviousCharacter;
38843895
const int kMaxRecursion = 100;
38853896
int utf8_bytes =
@@ -3889,7 +3900,8 @@ int String::WriteUtf8(char* buffer,
38893900
string_length,
38903901
kMaxRecursion,
38913902
previous,
3892-
&previous);
3903+
&previous,
3904+
replace_invalid_utf8);
38933905
if (utf8_bytes >= 0) {
38943906
// Success serializing with recursion.
38953907
if ((options & NO_NULL_TERMINATION) == 0 &&
@@ -3942,22 +3954,25 @@ int String::WriteUtf8(char* buffer,
39423954
char intermediate[unibrow::Utf8::kMaxEncodedSize];
39433955
for (; i < len && pos < capacity; i++) {
39443956
i::uc32 c = write_input_buffer.GetNext();
3945-
if (unibrow::Utf16::IsTrailSurrogate(c) &&
3946-
unibrow::Utf16::IsLeadSurrogate(previous)) {
3957+
if (unibrow::Utf16::IsSurrogatePair(previous, c)) {
39473958
// We can't use the intermediate buffer here because the encoding
39483959
// of surrogate pairs is done under assumption that you can step
39493960
// back and fix the UTF8 stream. Luckily we only need space for one
39503961
// more byte, so there is always space.
39513962
ASSERT(pos < capacity);
3952-
int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
3963+
int written = unibrow::Utf8::Encode(buffer + pos,
3964+
c,
3965+
previous,
3966+
replace_invalid_utf8);
39533967
ASSERT(written == 1);
39543968
pos += written;
39553969
nchars++;
39563970
} else {
39573971
int written =
39583972
unibrow::Utf8::Encode(intermediate,
39593973
c,
3960-
unibrow::Utf16::kNoPreviousCharacter);
3974+
unibrow::Utf16::kNoPreviousCharacter,
3975+
replace_invalid_utf8);
39613976
if (pos + written <= capacity) {
39623977
for (int j = 0; j < written; j++) {
39633978
buffer[pos + j] = intermediate[j];

deps/v8/src/unicode-inl.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,10 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
7979
}
8080

8181

82-
unsigned Utf8::Encode(char* str, uchar c, int previous) {
82+
unsigned Utf8::Encode(char* str,
83+
uchar c,
84+
int previous,
85+
bool replace_invalid) {
8386
static const int kMask = ~(1 << 6);
8487
if (c <= kMaxOneByteChar) {
8588
str[0] = c;
@@ -89,12 +92,16 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) {
8992
str[1] = 0x80 | (c & kMask);
9093
return 2;
9194
} else if (c <= kMaxThreeByteChar) {
92-
if (Utf16::IsTrailSurrogate(c) &&
93-
Utf16::IsLeadSurrogate(previous)) {
95+
if (Utf16::IsSurrogatePair(previous, c)) {
9496
const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
9597
return Encode(str - kUnmatchedSize,
9698
Utf16::CombineSurrogatePair(previous, c),
97-
Utf16::kNoPreviousCharacter) - kUnmatchedSize;
99+
Utf16::kNoPreviousCharacter,
100+
replace_invalid) - kUnmatchedSize;
101+
} else if (replace_invalid &&
102+
(Utf16::IsLeadSurrogate(c) ||
103+
Utf16::IsTrailSurrogate(c))) {
104+
c = kBadChar;
98105
}
99106
str[0] = 0xE0 | (c >> 12);
100107
str[1] = 0x80 | ((c >> 6) & kMask);

deps/v8/src/unicode.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,9 @@ class Buffer {
117117

118118
class Utf16 {
119119
public:
120+
static inline bool IsSurrogatePair(int lead, int trail) {
121+
return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
122+
}
120123
static inline bool IsLeadSurrogate(int code) {
121124
if (code == kNoPreviousCharacter) return false;
122125
return (code & 0xfc00) == 0xd800;
@@ -152,13 +155,19 @@ class Utf16 {
152155
class Utf8 {
153156
public:
154157
static inline uchar Length(uchar chr, int previous);
155-
static inline unsigned Encode(
156-
char* out, uchar c, int previous);
158+
static inline unsigned Encode(char* out,
159+
uchar c,
160+
int previous,
161+
bool replace_invalid = false);
157162
static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
158163
unsigned capacity, unsigned* chars_read, unsigned* offset);
159164
static uchar CalculateValue(const byte* str,
160165
unsigned length,
161166
unsigned* cursor);
167+
168+
169+
// The unicode replacement character, used to signal invalid unicode
170+
// sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
162171
static const uchar kBadChar = 0xFFFD;
163172
static const unsigned kMaxEncodedSize = 4;
164173
static const unsigned kMaxOneByteChar = 0x7f;
@@ -170,6 +179,9 @@ class Utf8 {
170179
// that match are coded as a 4 byte UTF-8 sequence.
171180
static const unsigned kBytesSavedByCombiningSurrogates = 2;
172181
static const unsigned kSizeOfUnmatchedSurrogate = 3;
182+
// The maximum size a single UTF-16 code unit may take up when encoded as
183+
// UTF-8.
184+
static const unsigned kMax16BitCodeUnitSize = 3;
173185

174186
private:
175187
template <unsigned s> friend class Utf8InputBuffer;

0 commit comments

Comments
 (0)