Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 82 additions & 34 deletions lib/internal/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,34 @@
// https://encoding.spec.whatwg.org

const {
ArrayPrototypeMap,
Boolean,
ObjectDefineProperties,
ObjectGetOwnPropertyDescriptors,
ObjectSetPrototypeOf,
ObjectValues,
SafeArrayIterator,
SafeMap,
StringPrototypeSlice,
Symbol,
SymbolToStringTag,
Uint8Array,
} = primordials;

const { FastBuffer } = require('internal/buffer');

const {
ERR_ENCODING_NOT_SUPPORTED,
ERR_INVALID_ARG_TYPE,
ERR_INVALID_THIS,
ERR_NO_ICU,
} = require('internal/errors').codes;
const kMethod = Symbol('method');
const kHandle = Symbol('handle');
const kFlags = Symbol('flags');
const kEncoding = Symbol('encoding');
const kDecoder = Symbol('decoder');
const kFatal = Symbol('kFatal');
const kUTF8FastPath = Symbol('kUTF8FastPath');
const kWindows1252FastPath = Symbol('kWindows1252FastPath');
const kIgnoreBOM = Symbol('kIgnoreBOM');

const {
Expand All @@ -55,11 +58,9 @@ const {
encodeIntoResults,
encodeUtf8String,
decodeUTF8,
decodeWindows1252,
decodeSingleByte,
} = binding;

const { Buffer } = require('buffer');

function validateDecoder(obj) {
if (obj == null || obj[kDecoder] !== true)
throw new ERR_INVALID_THIS('TextDecoder');
Expand All @@ -69,7 +70,48 @@ const CONVERTER_FLAGS_FLUSH = 0x1;
const CONVERTER_FLAGS_FATAL = 0x2;
const CONVERTER_FLAGS_IGNORE_BOM = 0x4;

const empty = new Uint8Array(0);
const empty = new FastBuffer();

// Single-byte encoding labels, in the order the native decoder expects.
// The position of each label in this list is the numeric key that
// createSinglebyteDecoder() hands to the native decodeSingleByte() binding,
// so the order has to be kept in sync with src/.
const singlebyteEncodingNames = [
  'ibm866',
  'koi8-r',
  'koi8-u',
  'macintosh',
  'x-mac-cyrillic',
  'iso-8859-2',
  'iso-8859-3',
  'iso-8859-4',
  'iso-8859-5',
  'iso-8859-6',
  'iso-8859-7',
  'iso-8859-8',
  'iso-8859-8-i',
  'iso-8859-10',
  'iso-8859-13',
  'iso-8859-14',
  'iso-8859-15',
  'iso-8859-16',
  'windows-874',
  'windows-1250',
  'windows-1251',
  'windows-1252',
  'windows-1253',
  'windows-1254',
  'windows-1255',
  'windows-1256',
  'windows-1257',
  'windows-1258',
  'x-user-defined', // Has to be last, special case
];

// Maps encoding label -> index in the list above.
const encodingsSinglebyte = new SafeMap(
  new SafeArrayIterator(
    ArrayPrototypeMap(singlebyteEncodingNames, (name, index) => [name, index]),
  ),
);

// Returns true when `enc` is one of the labels registered in
// encodingsSinglebyte.
const isSinglebyteEncoding = (enc) => {
  return encodingsSinglebyte.has(enc);
};

/**
 * Builds a decode function bound to one single-byte encoding.
 * @param {string} encoding Label looked up in encodingsSinglebyte.
 * @param {boolean} fatal Forwarded unchanged to decodeSingleByte().
 * @returns {Function} `(buf) => string`, delegating to decodeSingleByte().
 * @throws {ERR_ENCODING_NOT_SUPPORTED} If the label is not registered.
 */
function createSinglebyteDecoder(encoding, fatal) {
  const encodingIndex = encodingsSinglebyte.get(encoding);
  if (encodingIndex !== undefined) {
    return (bytes) => decodeSingleByte(bytes, encodingIndex, fatal);
  }
  throw new ERR_ENCODING_NOT_SUPPORTED(encoding);
}

const encodings = new SafeMap([
['unicode-1-1-utf-8', 'utf-8'],
Expand Down Expand Up @@ -387,6 +429,24 @@ ObjectDefineProperties(
[SymbolToStringTag]: { __proto__: null, configurable: true, value: 'TextEncoder' },
});

/**
 * Converts a decoder input into a FastBuffer.
 * @param {ArrayBuffer|SharedArrayBuffer|ArrayBufferView} input
 * @returns {FastBuffer} A FastBuffer built from `input`, or the shared
 *   `empty` buffer when constructing the FastBuffer throws.
 * @throws {ERR_INVALID_ARG_TYPE} If `input` is neither an (any) ArrayBuffer
 *   nor an ArrayBufferView.
 */
function parseInput(input) {
  const isBuffer = isAnyArrayBuffer(input);
  if (!isBuffer && !isArrayBufferView(input)) {
    throw new ERR_INVALID_ARG_TYPE('input', ['ArrayBuffer', 'ArrayBufferView'], input);
  }

  // Construction failures are deliberately swallowed and decoded as empty
  // input (presumably this covers detached buffers - confirm).
  try {
    return isBuffer ?
      new FastBuffer(input) :
      new FastBuffer(input.buffer, input.byteOffset, input.byteLength);
  } catch {
    return empty;
  }
}

const TextDecoder =
internalBinding('config').hasIntl ?
makeTextDecoderICU() :
Expand Down Expand Up @@ -420,10 +480,12 @@ function makeTextDecoderICU() {
this[kFatal] = Boolean(options?.fatal);
// Only support fast path for UTF-8.
this[kUTF8FastPath] = enc === 'utf-8';
this[kWindows1252FastPath] = enc === 'windows-1252';
this[kHandle] = undefined;
this[kMethod] = undefined;

if (!this[kUTF8FastPath] && !this[kWindows1252FastPath]) {
if (isSinglebyteEncoding(this.encoding)) {
this[kMethod] = createSinglebyteDecoder(this.encoding, this[kFatal]);
} else if (!this[kUTF8FastPath]) {
this.#prepareConverter();
}
}
Expand All @@ -438,22 +500,18 @@ function makeTextDecoderICU() {

decode(input = empty, options = kEmptyObject) {
validateDecoder(this);
validateObject(options, 'options', kValidateObjectAllowObjectsAndNull);

if (this[kMethod]) return this[kMethod](input);

this[kUTF8FastPath] &&= !(options?.stream);
this[kWindows1252FastPath] &&= !(options?.stream);

if (this[kUTF8FastPath]) {
return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
}

if (this[kWindows1252FastPath]) {
return decodeWindows1252(input, this[kIgnoreBOM], this[kFatal]);
}

this.#prepareConverter();

validateObject(options, 'options', kValidateObjectAllowObjectsAndNull);

let flags = 0;
if (options !== null)
flags |= options.stream ? 0 : CONVERTER_FLAGS_FLUSH;
Expand All @@ -476,7 +534,7 @@ function makeTextDecoderJS() {
const kBOMSeen = Symbol('BOM seen');

function hasConverter(encoding) {
return encoding === 'utf-8' || encoding === 'utf-16le';
return encoding === 'utf-8' || encoding === 'utf-16le' || isSinglebyteEncoding(encoding);
}

class TextDecoder {
Expand All @@ -502,30 +560,20 @@ function makeTextDecoderJS() {
this[kFlags] = flags;
this[kEncoding] = enc;
this[kBOMSeen] = false;
this[kMethod] = undefined;

if (isSinglebyteEncoding(this.encoding)) {
this[kMethod] = createSinglebyteDecoder(this.encoding, this[kFatal]);
}
}

decode(input = empty, options = kEmptyObject) {
validateDecoder(this);
if (isAnyArrayBuffer(input)) {
try {
input = Buffer.from(input);
} catch {
input = empty;
}
} else if (isArrayBufferView(input)) {
try {
input = Buffer.from(input.buffer, input.byteOffset,
input.byteLength);
} catch {
input = empty;
}
} else {
throw new ERR_INVALID_ARG_TYPE('input',
['ArrayBuffer', 'ArrayBufferView'],
input);
}
input = parseInput(input);
validateObject(options, 'options', kValidateObjectAllowObjectsAndNull);

if (this[kMethod]) return this[kMethod](input);

if (this[kFlags] & CONVERTER_FLAGS_FLUSH) {
this[kBOMSeen] = false;
}
Expand Down
133 changes: 59 additions & 74 deletions src/encoding_binding.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "encoding_binding.h"
#include "encoding_singlebyte.h"
#include "ada.h"
#include "env-inl.h"
#include "node_buffer.h"
#include "node_errors.h"
#include "node_external_reference.h"
#include "simdutf.h"

#include <cstdlib>
Expand Down Expand Up @@ -380,6 +380,62 @@ void BindingData::DecodeUTF8(const FunctionCallbackInfo<Value>& args) {
}
}

void BindingData::DecodeSingleByte(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);

CHECK_GE(args.Length(), 2);

if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
args[0]->IsArrayBufferView())) {
return node::THROW_ERR_INVALID_ARG_TYPE(
env->isolate(),
"The \"list\" argument must be an instance of SharedArrayBuffer, "
"ArrayBuffer or ArrayBufferView.");
}

CHECK(args[1]->IsInt32());
const int encoding = args[1].As<v8::Int32>()->Value();
CHECK(encoding >= 0 && encoding < 29);

ArrayBufferViewContents<uint8_t> buffer(args[0]);
const uint8_t* data = buffer.data();
size_t length = buffer.length();

if (length == 0) return args.GetReturnValue().SetEmptyString();

if (!simdutf::validate_ascii_with_errors(reinterpret_cast<const char*>(data), length).error) {
Local<Value> ret;
if (StringBytes::Encode(env->isolate(), reinterpret_cast<const char*>(data), length, LATIN1).ToLocal(&ret)) {
args.GetReturnValue().Set(ret);
}
return;
}

uint16_t* dst = node::UncheckedMalloc<uint16_t>(length);

if (encoding == 28) {
// x-user-defined
for (size_t i = 0; i < length; i++) dst[i] = data[i] >= 0x80 ? data[i] + 0xf700 : data[i];
} else {
bool has_fatal = args[2]->IsTrue();

const uint16_t* table = tSingleByteEncodings[encoding];
for (size_t i = 0; i < length; i++) dst[i] = table[data[i]];

if (has_fatal && fSingleByteEncodings[encoding] &&
simdutf::find(reinterpret_cast<char16_t*>(dst), reinterpret_cast<char16_t*>(dst) + length, 0xfffd) != reinterpret_cast<char16_t*>(dst) + length
) {
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
env->isolate(), "The encoded data was not valid for this encoding");
}
}

Local<Value> ret;
if (StringBytes::Raw(env->isolate(), dst, length).ToLocal(&ret)) {
args.GetReturnValue().Set(ret);
}
}

void BindingData::ToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
Expand Down Expand Up @@ -412,10 +468,9 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
SetMethod(isolate, target, "encodeInto", EncodeInto);
SetMethodNoSideEffect(isolate, target, "encodeUtf8String", EncodeUtf8String);
SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8);
SetMethodNoSideEffect(isolate, target, "decodeSingleByte", DecodeSingleByte);
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
SetMethodNoSideEffect(
isolate, target, "decodeWindows1252", DecodeWindows1252);
}

void BindingData::CreatePerContextProperties(Local<Object> target,
Expand All @@ -431,79 +486,9 @@ void BindingData::RegisterTimerExternalReferences(
registry->Register(EncodeInto);
registry->Register(EncodeUtf8String);
registry->Register(DecodeUTF8);
registry->Register(DecodeSingleByte);
registry->Register(ToASCII);
registry->Register(ToUnicode);
registry->Register(DecodeWindows1252);
}

// Decodes windows-1252 bytes into a JS string.
//
//   args[0]  ArrayBuffer, SharedArrayBuffer or ArrayBufferView with the input.
//
// Any further arguments are accepted for call-site compatibility but are not
// used: windows-1252 has no byte order mark, so there is nothing for an
// ignoreBOM flag to control. A leading 0xFF byte is ordinary data (U+00FF)
// and must not be dropped.
//
// Returns the decoded string through args.GetReturnValue(); throws
// ERR_INVALID_ARG_TYPE when args[0] has the wrong type.
void BindingData::DecodeWindows1252(const FunctionCallbackInfo<Value>& args) {
  Environment* env = Environment::GetCurrent(args);

  CHECK_GE(args.Length(), 1);
  if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
        args[0]->IsArrayBufferView())) {
    return node::THROW_ERR_INVALID_ARG_TYPE(
        env->isolate(),
        "The \"input\" argument must be an instance of ArrayBuffer, "
        "SharedArrayBuffer, or ArrayBufferView.");
  }

  ArrayBufferViewContents<uint8_t> buffer(args[0]);
  const uint8_t* data = buffer.data();
  const size_t length = buffer.length();

  if (length == 0) {
    return args.GetReturnValue().SetEmptyString();
  }

  // Windows-1252 specific mapping for bytes 128-159
  // These differ from Latin-1/ISO-8859-1
  static const uint16_t windows1252_mapping[32] = {
      0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,  // 80-87
      0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,  // 88-8F
      0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,  // 90-97
      0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178   // 98-9F
  };

  std::string result;
  result.reserve(length * 3);  // Each input byte yields at most 3 UTF-8 bytes

  for (size_t i = 0; i < length; i++) {
    const uint8_t byte = data[i];

    // Bytes 0x80-0x9F use the table above; every other byte is identical to
    // Latin-1, i.e. the code point equals the byte value.
    const uint32_t codepoint =
        (byte >= 0x80 && byte <= 0x9F) ? windows1252_mapping[byte - 0x80]
                                       : byte;

    // Convert codepoint to UTF-8 (all mapped code points fit in 3 bytes)
    if (codepoint < 0x80) {
      result.push_back(static_cast<char>(codepoint));
    } else if (codepoint < 0x800) {
      result.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
      result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else {
      result.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
      result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
      result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }
  }

  Local<Value> ret;
  if (ToV8Value(env->context(), result, env->isolate()).ToLocal(&ret)) {
    args.GetReturnValue().Set(ret);
  }
}

} // namespace encoding_binding
Expand Down
3 changes: 1 addition & 2 deletions src/encoding_binding.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ class BindingData : public SnapshotableObject {
static void EncodeInto(const v8::FunctionCallbackInfo<v8::Value>& args);
static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeWindows1252(
const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeSingleByte(const v8::FunctionCallbackInfo<v8::Value>& args);

static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);
Expand Down
Loading