Skip to content

Commit 30672f4

Browse files
ChALkeR and mertcanaltin committed
src: move all 1-byte encodings to native
Co-authored-by: Mert Can Altin <mertgold60@gmail.com>
1 parent e155415 commit 30672f4

File tree

11 files changed

+595
-159
lines changed

11 files changed

+595
-159
lines changed

lib/internal/encoding.js

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44
// https://encoding.spec.whatwg.org
55

66
const {
7+
ArrayPrototypeMap,
78
Boolean,
89
ObjectDefineProperties,
910
ObjectGetOwnPropertyDescriptors,
1011
ObjectSetPrototypeOf,
1112
ObjectValues,
13+
SafeArrayIterator,
1214
SafeMap,
1315
StringPrototypeSlice,
1416
Symbol,
@@ -32,8 +34,6 @@ const kFatal = Symbol('kFatal');
3234
const kUTF8FastPath = Symbol('kUTF8FastPath');
3335
const kIgnoreBOM = Symbol('kIgnoreBOM');
3436

35-
const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte');
36-
3737
const {
3838
getConstructorOf,
3939
customInspectSymbol: inspect,
@@ -60,6 +60,7 @@ const {
6060
encodeIntoResults,
6161
encodeUtf8String,
6262
decodeUTF8,
63+
decodeSingleByte,
6364
} = binding;
6465

6566
function validateDecoder(obj) {
@@ -73,6 +74,47 @@ const CONVERTER_FLAGS_IGNORE_BOM = 0x4;
7374

7475
const empty = new FastBuffer();
7576

77+
// Single-byte encoding names mapped to their index in the list below.
// createSinglebyteDecoder() looks the index up here and passes it to the
// native decodeSingleByte() as the encoding key, so the list order determines
// every key.
// Has to be synced with src/
// NOTE(review): presumably the table in src/encoding_singlebyte.cc - confirm.
const encodingsSinglebyte = new SafeMap(new SafeArrayIterator(ArrayPrototypeMap([
  'ibm866',
  'koi8-r',
  'koi8-u',
  'macintosh',
  'x-mac-cyrillic',
  'iso-8859-2',
  'iso-8859-3',
  'iso-8859-4',
  'iso-8859-5',
  'iso-8859-6',
  'iso-8859-7',
  'iso-8859-8',
  'iso-8859-8-i',
  'iso-8859-10',
  'iso-8859-13',
  'iso-8859-14',
  'iso-8859-15',
  'iso-8859-16',
  'windows-874',
  'windows-1250',
  'windows-1251',
  'windows-1252',
  'windows-1253',
  'windows-1254',
  'windows-1255',
  'windows-1256',
  'windows-1257',
  'windows-1258',
  'x-user-defined', // Has to be last, special case
], (e, i) => [e, i])));
109+
110+
/**
 * Tells whether `enc` is one of the keys of `encodingsSinglebyte`.
 * @param {*} enc Value to look up (the map's keys are encoding name strings).
 * @returns {boolean} `true` when the map has an entry for `enc`.
 */
const isSinglebyteEncoding = (enc) => {
  return encodingsSinglebyte.has(enc);
};
111+
112+
/**
 * Builds a decode function for one single-byte encoding.
 *
 * The encoding name is resolved once to its numeric key in
 * `encodingsSinglebyte`; the returned function forwards every input to the
 * native `decodeSingleByte(buf, key, fatal)` binding.
 * @param {string} encoding Encoding name; must be a key of
 *   `encodingsSinglebyte`.
 * @param {*} fatal Passed through unchanged to `decodeSingleByte` on each
 *   call.
 * @returns {Function} `(buf) => decodeSingleByte(buf, key, fatal)`.
 * @throws {ERR_ENCODING_NOT_SUPPORTED} When `encoding` has no entry in
 *   `encodingsSinglebyte`.
 */
function createSinglebyteDecoder(encoding, fatal) {
  const key = encodingsSinglebyte.get(encoding);
  // Compare against undefined explicitly: 0 is a valid key (first list entry).
  if (key === undefined) throw new ERR_ENCODING_NOT_SUPPORTED(encoding);
  return (buf) => decodeSingleByte(buf, key, fatal);
}
117+
76118
const encodings = new SafeMap([
77119
['unicode-1-1-utf-8', 'utf-8'],
78120
['unicode11utf8', 'utf-8'],
@@ -479,7 +521,7 @@ class TextDecoder {
479521
validateDecoder(this);
480522
validateObject(options, 'options', kValidateObjectAllowObjectsAndNull);
481523

482-
if (this[kSingleByte]) return this[kSingleByte](parseInput(input));
524+
if (this[kSingleByte]) return this[kSingleByte](input);
483525

484526
const stream = options?.stream;
485527
if (this[kUTF8FastPath]) {

lib/internal/encoding/single-byte.js

Lines changed: 0 additions & 155 deletions
This file was deleted.

lib/internal/encoding/util.js

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// From https://npmjs.com/package/@exodus/bytes
2+
// Copyright Exodus Movement. Licensed under MIT License.
3+
4+
'use strict';
5+
6+
/**
 * Get the number of last bytes in a Uint8Array `u` ending at `len` that don't
 * form a code point yet, but can be a part of a single code point once more
 * data arrives.
 * @param {Uint8Array} u Bytes to inspect.
 * @param {number} len Exclusive end offset; only `u[0]`..`u[len - 1]` are
 *   read.
 * @returns {number} 0-3: count of trailing bytes that form an incomplete but
 *   still possibly valid UTF-8 sequence, or 0 when there is none.
 */
function unfinishedBytesUtf8(u, len) {
  // Go back over 0-2 trailing continuation bytes (0b10xxxxxx).
  let p = 0;
  while (p < 2 && p < len && (u[len - p - 1] & 0xc0) === 0x80) p++;
  if (p === len) return 0; // No space for a lead byte

  const l = u[len - p - 1];
  if (l < 0xc2 || l > 0xf4) return 0; // Not a lead byte
  // Nothing to recheck, we only have the lead byte, so return it.
  // A 2-byte sequence must return here.
  if (p === 0) return 1;
  // 2-byte lead, or a 3-byte (or shorter) lead that already has 2 trailing
  // bytes: the sequence is already complete (or invalid), not unfinished.
  if (l < 0xe0 || (l < 0xf0 && p >= 2)) return 0;

  // Allowed range of the first continuation byte for this lead:
  // E0 -> A0..BF, ED -> 80..9F, F0 -> 90..BF, F4 -> 80..8F, otherwise 80..BF.
  const lower = l === 0xf0 ? 0x90 : l === 0xe0 ? 0xa0 : 0x80;
  const upper = l === 0xf4 ? 0x8f : l === 0xed ? 0x9f : 0xbf;
  const n = u[len - p]; // First byte after the lead
  return n >= lower && n <= upper ? p + 1 : 0;
}
22+
23+
/**
 * Merge prefix `chunk` with `u` and return the new combined prefix.
 *
 * For `u.length < 3`, fully consumes `u` and can return unfinished data,
 * otherwise returns a prefix with no unfinished bytes.
 * @param {Uint8Array} u Newly arrived bytes that follow `chunk`.
 * @param {Uint8Array} chunk Previously buffered prefix bytes.
 * @returns {Uint8Array} `chunk` itself when `u` is empty or when no byte of
 *   `u` has to be taken; otherwise a new array (or a view into one) holding
 *   `chunk` followed by the leading bytes taken from `u`.
 */
function mergePrefixUtf8(u, chunk) {
  if (u.length === 0) return chunk;
  if (u.length < 3) {
    // No reason to bruteforce offsets, also it's possible this doesn't yet
    // end the sequence.
    const a = new Uint8Array(u.length + chunk.length);
    a.set(chunk);
    a.set(u, chunk.length);
    return a;
  }

  // Slice off a small portion of u into the prefix chunk so we can decode
  // them separately without extending the array size.
  // We have 1-3 bytes and need 1-3 more bytes.
  const t = new Uint8Array(chunk.length + 3);
  t.set(chunk);
  t.set(u.subarray(0, 3), chunk.length);

  // Stop at the first offset where unfinished bytes reaches 0 or fits into u.
  // If that doesn't happen (u too short), chunk and u are concatenated
  // completely by the branch above.
  for (let i = 1; i <= 3; i++) {
    const unfinished = unfinishedBytesUtf8(t, chunk.length + i); // 0-3
    if (unfinished <= i) {
      // Always reachable at 3, but we still need 'unfinished' value for it.
      const add = i - unfinished; // 0-3
      return add > 0 ? t.subarray(0, chunk.length + add) : chunk;
    }
  }

  // Unreachable: unfinishedBytesUtf8() returns at most 3, so the condition
  // above always holds at i === 3.
}
54+
55+
module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 }

node.gyp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@
8989
'src/debug_utils.cc',
9090
'src/embedded_data.cc',
9191
'src/encoding_binding.cc',
92+
'src/encoding_singlebyte.cc',
9293
'src/env.cc',
9394
'src/fs_event_wrap.cc',
9495
'src/handle_wrap.cc',
@@ -221,6 +222,7 @@
221222
'src/debug_utils-inl.h',
222223
'src/embedded_data.h',
223224
'src/encoding_binding.h',
225+
'src/encoding_singlebyte.h',
224226
'src/env_properties.h',
225227
'src/env.h',
226228
'src/env-inl.h',

0 commit comments

Comments
 (0)