diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 2778422ea4e7b7..2c6ddccc77dcfa 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -760,10 +760,23 @@ void StringWrite(const FunctionCallbackInfo& args) { void SlowByteLengthUtf8(const FunctionCallbackInfo& args) { CHECK(args[0]->IsString()); - - // Fast case: avoid StringBytes on UTF8 string. Jump to v8. - size_t result = args[0].As()->Utf8LengthV2(args.GetIsolate()); - args.GetReturnValue().Set(static_cast(result)); + Isolate* isolate = args.GetIsolate(); + Local str = args[0].As(); + + // Below ~512 units, or for one-byte, V8's Utf8LengthV2 is faster. + if (str->Length() >= 512 && !str->IsOneByte()) { + String::ValueView view(isolate, str); + if (!view.is_one_byte()) { + // with_replacement matches Buffer.from's U+FFFD for lone surrogates. + size_t result = + simdutf::utf8_length_from_utf16_with_replacement( + reinterpret_cast(view.data16()), view.length()) + .count; + args.GetReturnValue().Set(static_cast(result)); + return; + } + } + args.GetReturnValue().Set(static_cast(str->Utf8LengthV2(isolate))); } uint32_t FastByteLengthUtf8( @@ -777,6 +790,17 @@ uint32_t FastByteLengthUtf8( Local sourceStr = sourceValue.As(); if (!sourceStr->IsExternalOneByte()) { + // Below ~512 units, or for one-byte, V8's Utf8LengthV2 is faster. + if (sourceStr->Length() >= 512 && !sourceStr->IsOneByte()) { + String::ValueView view(isolate, sourceStr); + if (!view.is_one_byte()) { + // with_replacement matches Buffer.from's U+FFFD for lone surrogates. + return simdutf::utf8_length_from_utf16_with_replacement( + reinterpret_cast(view.data16()), + view.length()) + .count; + } + } return sourceStr->Utf8LengthV2(isolate); } auto source = sourceStr->GetExternalOneByteStringResource(); diff --git a/test/parallel/test-buffer-bytelength.js b/test/parallel/test-buffer-bytelength.js index fec508ffb4048a..267c1e26e876ab 100644 --- a/test/parallel/test-buffer-bytelength.js +++ b/test/parallel/test-buffer-bytelength.js @@ -125,3 +125,49 @@ for (let i = 1; i < 10; i++) { assert.strictEqual(Buffer.byteLength('foo', encoding), Buffer.byteLength('foo', 'utf8')); } + +// byteLength('utf8') must equal the bytes Buffer.from writes, including +// unpaired surrogates (3-byte U+FFFD). Large inputs exercise the SIMD path. +{ + const HI = '\uD83D'; // Unpaired high surrogate + const LO = '\uDE00'; // Unpaired low surrogate + const PAIR = '\u{1F600}'; // Valid surrogate pair (4 bytes) + const enc = new TextEncoder(); + + // Independent UTF-8 byte-length oracle with replacement semantics. + const utf8Bytes = (str) => { + let n = 0; + for (let i = 0; i < str.length; i++) { + const c = str.charCodeAt(i); + if (c < 0x80) n += 1; + else if (c < 0x800) n += 2; + else if (c >= 0xD800 && c <= 0xDBFF) { + const next = str.charCodeAt(i + 1); + if (next >= 0xDC00 && next <= 0xDFFF) { n += 4; i++; } else n += 3; + } else n += 3; // BMP >= 0x800, incl. unpaired low surrogate + } + return n; + }; + + const cases = [ + 'a'.repeat(300) + HI + 'b'.repeat(300), // Unpaired high inside large 2-byte + 'a'.repeat(300) + LO + 'b'.repeat(300), // Unpaired low inside large 2-byte + HI.repeat(200), // many unpaired highs + (HI + LO).repeat(200), // Reversed order, still unpaired + PAIR.repeat(200), // valid pairs, large + '中'.repeat(500), // BMP 3-byte, large + 'é'.repeat(500), // latin1, large + `a中${PAIR}${HI}é`.repeat(100), // mixed, large + `A${HI}`, // Tiny, below SIMD threshold + HI, // Single unpaired surrogate + '', + ]; + + for (const s of cases) { + const ref = enc.encode(s); // WHATWG ground-truth UTF-8 bytes + const label = JSON.stringify(s).slice(0, 32); + assert.strictEqual(utf8Bytes(s), ref.length, `oracle vs TextEncoder: ${label}`); + assert.strictEqual(Buffer.byteLength(s, 'utf8'), ref.length, `byteLength: ${label}`); + assert.deepStrictEqual(new Uint8Array(Buffer.from(s, 'utf8')), ref, `bytes: ${label}`); + } +}