From 65e06d9cf3b88e9b1d75c357baf5326c28d8876c Mon Sep 17 00:00:00 2001 From: theweipeng Date: Sat, 21 Dec 2024 21:36:22 +0800 Subject: [PATCH 1/4] src: detect whether the string is one byte representation or not References: nodejs#56090 --- doc/api/v8.md | 109 ++++++++++++++++++ lib/v8.js | 13 +++ src/node_external_reference.h | 4 + src/node_v8.cc | 28 +++++ ...st-v8-string-is-one-byte-representation.js | 37 ++++++ 5 files changed, 191 insertions(+) create mode 100644 test/parallel/test-v8-string-is-one-byte-representation.js diff --git a/doc/api/v8.md b/doc/api/v8.md index 670283e17f5d80..26b77b3adce39d 100644 --- a/doc/api/v8.md +++ b/doc/api/v8.md @@ -1304,6 +1304,115 @@ setTimeout(() => { }, 1000); ``` +## `v8.isStringOneByteRepresentation(content)` + + + +* `content` {string} +* Returns: {boolean} + +V8 only supports `Latin-1/ISO-8859-1` and `UTF16` as the underlying representations. +If the `content` uses `Latin-1/ISO-8859-1` as the underlying representation, this function will return true; +otherwise, it returns false. + +If this method returns false, that does not mean that the string contains some characters not in `Latin-1/ISO-8859-1`. +Sometimes a `Latin-1` string may also be represented as `UTF16`. + +```js +const { isStringOneByteRepresentation } = require('node:v8'); +const assert = require('node:assert'); + +const Encoding = { + latin1: 1, + utf16le: 2, +}; + +/** + * Read strings from the buffer. + * + * Note that this example ignores flag checks and boundary checks. 
+ */ +class Deserializer { + buffer; + cursor; + + constructor(buffer) { + this.buffer = buffer; + this.cursor = 0; + } + + readString() { + const encoding = this.buffer.readUint8(this.cursor++); + const length = this.buffer.readUint32LE(this.cursor); + this.cursor += 4; + if (encoding === Encoding.latin1) { + const result = this.buffer.toString('latin1', this.cursor, this.cursor + length); + this.cursor += length; + return result; + } + const result = this.buffer.toString('utf16le', this.cursor, this.cursor + length); + this.cursor += length; + return result; + } +} + +/** + * By means of the `isStringOneByteRepresentation` function, + * we can write strings into the buffer with high performance, + * and it only takes the time of a memcopy. + * + * Note that this example ignores boundary checks. + */ +class Serializer { + buffer; + cursor; + + constructor() { + this.buffer = Buffer.alloc(100); + this.cursor = 0; + } + + /** + * step1: Write the encoding flag to the first byte. + * step2: Write the byte length of the string to the next four bytes. + * step3: Write the string to the buffer. + * @param {string} input + */ + writeString(input) { + if (isStringOneByteRepresentation(input)) { + this.buffer.writeUint8(Encoding.latin1, this.cursor++); + this.buffer.writeUint32LE(input.length, this.cursor); + this.cursor += 4; + this.buffer.write(input, this.cursor, 'latin1'); + this.cursor += input.length; + } else { + this.buffer.writeUint8(Encoding.utf16le, this.cursor++); + this.buffer.writeUint32LE(input.length * 2, this.cursor); + this.cursor += 4; + this.buffer.write(input, this.cursor, 'utf16le'); + this.cursor += input.length * 2; + } + } + + finish() { + return this.buffer.subarray(0, this.cursor); + } +} + +// Write strings to the buffer. +const serializer = new Serializer(); +serializer.writeString('hello'); +serializer.writeString('你好'); +const data = serializer.finish(); + +// Read strings from the buffer. 
+const deserializer = new Deserializer(data); +assert(deserializer.readString() === 'hello'); +assert(deserializer.readString() === '你好'); +``` + [HTML structured clone algorithm]: https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Structured_clone_algorithm [Hook Callbacks]: #hook-callbacks [V8]: https://developers.google.com/v8/ diff --git a/lib/v8.js b/lib/v8.js index 7a8979887bab49..381aabfcbfafab 100644 --- a/lib/v8.js +++ b/lib/v8.js @@ -104,6 +104,7 @@ const binding = internalBinding('v8'); const { cachedDataVersionTag, setFlagsFromString: _setFlagsFromString, + isStringOneByteRepresentation: _isStringOneByteRepresentation, updateHeapStatisticsBuffer, updateHeapSpaceStatisticsBuffer, updateHeapCodeStatisticsBuffer, @@ -155,6 +156,17 @@ function setFlagsFromString(flags) { _setFlagsFromString(flags); } +/** + * Return whether this string uses one byte as underlying representation or not. + * @param {string} content + * @returns {boolean} + */ +function isStringOneByteRepresentation(content) { + validateString(content, 'content'); + return _isStringOneByteRepresentation(content); +} + + /** * Gets the current V8 heap statistics. 
* @returns {{ @@ -439,4 +451,5 @@ module.exports = { startupSnapshot, setHeapSnapshotNearHeapLimit, GCProfiler, + isStringOneByteRepresentation, }; diff --git a/src/node_external_reference.h b/src/node_external_reference.h index 8d49a119c21832..bb007dbdcce486 100644 --- a/src/node_external_reference.h +++ b/src/node_external_reference.h @@ -12,6 +12,9 @@ namespace node { using CFunctionCallbackWithOneByteString = uint32_t (*)(v8::Local, const v8::FastOneByteString&); + +using CFunctionCallbackReturnBool = bool (*)(v8::Local unused, + v8::Local receiver); using CFunctionCallback = void (*)(v8::Local unused, v8::Local receiver); using CFunctionCallbackReturnDouble = @@ -90,6 +93,7 @@ class ExternalReferenceRegistry { #define ALLOWED_EXTERNAL_REFERENCE_TYPES(V) \ V(CFunctionCallback) \ V(CFunctionCallbackWithOneByteString) \ + V(CFunctionCallbackReturnBool) \ V(CFunctionCallbackReturnDouble) \ V(CFunctionCallbackReturnInt32) \ V(CFunctionCallbackValueReturnDouble) \ diff --git a/src/node_v8.cc b/src/node_v8.cc index a7f0ba7973498e..eecf09f048891d 100644 --- a/src/node_v8.cc +++ b/src/node_v8.cc @@ -32,6 +32,7 @@ namespace node { namespace v8_utils { using v8::Array; +using v8::CFunction; using v8::Context; using v8::FunctionCallbackInfo; using v8::FunctionTemplate; @@ -238,6 +239,23 @@ void SetFlagsFromString(const FunctionCallbackInfo& args) { V8::SetFlagsFromString(*flags, static_cast(flags.length())); } +static void IsStringOneByteRepresentation( + const FunctionCallbackInfo& args) { + CHECK_EQ(args.Length(), 1); + CHECK(args[0]->IsString()); + bool is_one_byte = args[0].As()->IsOneByte(); + args.GetReturnValue().Set(is_one_byte); +} + +static bool FastIsStringOneByteRepresentation(Local receiver, + const Local target) { + CHECK(target->IsString()); + return target.As()->IsOneByte(); +} + +CFunction fast_is_string_one_byte_representation_( + CFunction::Make(FastIsStringOneByteRepresentation)); + static const char* GetGCTypeName(v8::GCType gc_type) { switch 
(gc_type) { case v8::GCType::kGCTypeScavenge: @@ -479,6 +497,13 @@ void Initialize(Local target, // Export symbols used by v8.setFlagsFromString() SetMethod(context, target, "setFlagsFromString", SetFlagsFromString); + // Export symbols used by v8.isStringOneByteRepresentation() + SetFastMethodNoSideEffect(context, + target, + "isStringOneByteRepresentation", + IsStringOneByteRepresentation, + &fast_is_string_one_byte_representation_); + // GCProfiler Local t = NewFunctionTemplate(env->isolate(), GCProfiler::New); @@ -498,6 +523,9 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) { registry->Register(GCProfiler::New); registry->Register(GCProfiler::Start); registry->Register(GCProfiler::Stop); + registry->Register(IsStringOneByteRepresentation); + registry->Register(FastIsStringOneByteRepresentation); + registry->Register(fast_is_string_one_byte_representation_.GetTypeInfo()); } } // namespace v8_utils diff --git a/test/parallel/test-v8-string-is-one-byte-representation.js b/test/parallel/test-v8-string-is-one-byte-representation.js new file mode 100644 index 00000000000000..0403299c01015f --- /dev/null +++ b/test/parallel/test-v8-string-is-one-byte-representation.js @@ -0,0 +1,37 @@ +// Flags: --expose-internals +'use strict'; +require('../common'); +const assert = require('assert'); +const { isStringOneByteRepresentation } = require('v8'); + +[ + undefined, + null, + false, + 5n, + 5, + Symbol(), + () => {}, + {}, +].forEach((value) => { + assert.throws( + () => { isStringOneByteRepresentation(value); }, + /The "content" argument must be of type string/ + ); +}); + +{ + const latin1String = 'hello world!'; + // Run this inside a for loop to trigger the fast API + for (let i = 0; i < 10_000; i++) { + assert.strictEqual(isStringOneByteRepresentation(latin1String), true); + } +} + +{ + const utf16String = '你好😀😃'; + // Run this inside a for loop to trigger the fast API + for (let i = 0; i < 10_000; i++) { + 
assert.strictEqual(isStringOneByteRepresentation(utf16String), false); + } +} From 56278fe01a85023ac399a5600a79f762dbe31307 Mon Sep 17 00:00:00 2001 From: theweipeng Date: Mon, 30 Dec 2024 21:39:27 +0800 Subject: [PATCH 2/4] doc: optimize the document --- doc/api/v8.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/api/v8.md b/doc/api/v8.md index 26b77b3adce39d..97c7cad6783029 100644 --- a/doc/api/v8.md +++ b/doc/api/v8.md @@ -1313,7 +1313,7 @@ added: REPLACEME * `content` {string} * Returns: {boolean} -V8 only supports `Latin-1/ISO-8859-1` and `UTF16` as the underlying representations. +V8 only supports `Latin-1/ISO-8859-1` and `UTF16` as the underlying representation of a string. If the `content` uses `Latin-1/ISO-8859-1` as the underlying representation, this function will return true; otherwise, it returns false. @@ -1322,7 +1322,7 @@ Sometimes a `Latin-1` string may also be represented as `UTF16`. ```js const { isStringOneByteRepresentation } = require('node:v8'); -const assert = require('node:assert'); +const { strictEqual } = require('node:assert'); const Encoding = { latin1: 1, @@ -1409,8 +1409,8 @@ const data = serializer.finish(); // Read strings from the buffer. 
const deserializer = new Deserializer(data); -assert(deserializer.readString() === 'hello'); -assert(deserializer.readString() === '你好'); +strictEqual(deserializer.readString(), 'hello'); +strictEqual(deserializer.readString(), '你好'); ``` [HTML structured clone algorithm]: https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Structured_clone_algorithm From 34d144f4ff5673a7710c0b4ffff27bb5ef4acfb7 Mon Sep 17 00:00:00 2001 From: theweipeng Date: Fri, 3 Jan 2025 20:06:13 +0800 Subject: [PATCH 3/4] doc: simplify the document --- doc/api/v8.md | 94 +++++++-------------------------------------------- 1 file changed, 12 insertions(+), 82 deletions(-) diff --git a/doc/api/v8.md b/doc/api/v8.md index 97c7cad6783029..f67e56697417c7 100644 --- a/doc/api/v8.md +++ b/doc/api/v8.md @@ -1322,95 +1322,25 @@ Sometimes a `Latin-1` string may also be represented as `UTF16`. ```js const { isStringOneByteRepresentation } = require('node:v8'); -const { strictEqual } = require('node:assert'); const Encoding = { latin1: 1, utf16le: 2, }; - -/** - * Read strings from the buffer. - * - * Note that this example ignores flag checks and boundary checks. - */ -class Deserializer { - buffer; - cursor; - - constructor(buffer) { - this.buffer = buffer; - this.cursor = 0; - } - - readString() { - const encoding = this.buffer.readUint8(this.cursor++); - const length = this.buffer.readUint32LE(this.cursor); - this.cursor += 4; - if (encoding === Encoding.latin1) { - const result = this.buffer.toString('latin1', this.cursor, this.cursor + length); - this.cursor += length; - return result; - } - const result = this.buffer.toString('utf16le', this.cursor, this.cursor + length); - this.cursor += length; - return result; - } -} - -/** - * By means of the `isStringOneByteRepresentation` function, - * we can write strings into the buffer with high performance, - * and it only takes the time of a memcopy. - * - * Note that this example ignores boundary checks. 
- */ -class Serializer { - buffer; - cursor; - - constructor() { - this.buffer = Buffer.alloc(100); - this.cursor = 0; - } - - /** - * step1: Write the encoding flag to the first byte. - * step2: Write the byte length of the string to the next four bytes. - * step3: Write the string to the buffer. - * @param {string} input - */ - writeString(input) { - if (isStringOneByteRepresentation(input)) { - this.buffer.writeUint8(Encoding.latin1, this.cursor++); - this.buffer.writeUint32LE(input.length, this.cursor); - this.cursor += 4; - this.buffer.write(input, this.cursor, 'latin1'); - this.cursor += input.length; - } else { - this.buffer.writeUint8(Encoding.utf16le, this.cursor++); - this.buffer.writeUint32LE(input.length * 2, this.cursor); - this.cursor += 4; - this.buffer.write(input, this.cursor, 'utf16le'); - this.cursor += input.length * 2; - } - } - - finish() { - return this.buffer.subarray(0, this.cursor); +const buffer = Buffer.alloc(100); +function writeString(input) { + if (isStringOneByteRepresentation(input)) { + buffer.writeUint8(Encoding.latin1); + buffer.writeUint32LE(input.length, 1); + buffer.write(input, 5, 'latin1'); + } else { + buffer.writeUint8(Encoding.utf16le); + buffer.writeUint32LE(input.length * 2, 1); + buffer.write(input, 5, 'utf16le'); } } - -// Write strings to the buffer. -const serializer = new Serializer(); -serializer.writeString('hello'); -serializer.writeString('你好'); -const data = serializer.finish(); - -// Read strings from the buffer. 
-const deserializer = new Deserializer(data); -strictEqual(deserializer.readString(), 'hello'); -strictEqual(deserializer.readString(), '你好'); +writeString('hello'); +writeString('你好'); ``` [HTML structured clone algorithm]: https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Structured_clone_algorithm From bd086ce48f85f9323dc386690861e0b3a0ef457b Mon Sep 17 00:00:00 2001 From: theweipeng Date: Fri, 3 Jan 2025 23:47:43 +0800 Subject: [PATCH 4/4] test: remove unused --expose-internals flag --- test/parallel/test-v8-string-is-one-byte-representation.js | 1 - 1 file changed, 1 deletion(-) diff --git a/test/parallel/test-v8-string-is-one-byte-representation.js b/test/parallel/test-v8-string-is-one-byte-representation.js index 0403299c01015f..ba6eee3021d869 100644 --- a/test/parallel/test-v8-string-is-one-byte-representation.js +++ b/test/parallel/test-v8-string-is-one-byte-representation.js @@ -1,4 +1,3 @@ -// Flags: --expose-internals 'use strict'; require('../common'); const assert = require('assert');