src: remove icu based ToASCII and ToUnicode
Some checks are pending
Coverage Linux (without intl) / coverage-linux-without-intl (push) Waiting to run
Coverage Linux / coverage-linux (push) Waiting to run
Coverage Windows / coverage-windows (push) Waiting to run
Test and upload documentation to artifacts / build-docs (push) Waiting to run
Linters / lint-addon-docs (push) Waiting to run
Linters / lint-cpp (push) Waiting to run
Linters / format-cpp (push) Waiting to run
Linters / lint-js-and-md (push) Waiting to run
Linters / lint-py (push) Waiting to run
Linters / lint-yaml (push) Waiting to run
Linters / lint-sh (push) Waiting to run
Linters / lint-codeowners (push) Waiting to run
Linters / lint-pr-url (push) Waiting to run
Linters / lint-readme (push) Waiting to run
Notify on Push / Notify on Force Push on `main` (push) Waiting to run
Notify on Push / Notify on Push on `main` that lacks metadata (push) Waiting to run
Scorecard supply-chain security / Scorecard analysis (push) Waiting to run

PR-URL: https://github.com/nodejs/node/pull/55156
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Matthew Aitken <maitken033380023@gmail.com>
Reviewed-By: Daniel Lemire <daniel@lemire.me>
Reviewed-By: Richard Lau <rlau@redhat.com>
This commit is contained in:
Yagiz Nizipli 2024-10-18 12:47:50 -04:00 committed by GitHub
parent 7c0cc12f0b
commit 9f5000e0f2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 2 additions and 389 deletions

View File

@ -60,19 +60,17 @@
#include <unicode/uchar.h>
#include <unicode/uclean.h>
#include <unicode/ucnv.h>
#include <unicode/udata.h>
#include <unicode/uidna.h>
#include <unicode/ulocdata.h>
#include <unicode/urename.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>
#include <unicode/utf8.h>
#include <unicode/utypes.h>
#include <unicode/uvernum.h>
#include <unicode/uversion.h>
#include "nbytes.h"
#ifdef NODE_HAVE_SMALL_ICU
#include <unicode/udata.h>
/* if this is defined, we have a 'secondary' entry point.
compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
#define SMALL_ICUDATA_ENTRY_POINT \
@ -96,7 +94,6 @@ using v8::Int32;
using v8::Isolate;
using v8::Local;
using v8::MaybeLocal;
using v8::NewStringType;
using v8::Object;
using v8::ObjectTemplate;
using v8::String;
@ -583,167 +580,6 @@ void SetDefaultTimeZone(const char* tzid) {
CHECK(U_SUCCESS(status));
}
int32_t ToUnicode(MaybeStackBuffer<char>* buf,
const char* input,
size_t length) {
UErrorCode status = U_ZERO_ERROR;
uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status))
return -1;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;
int32_t len = uidna_nameToUnicodeUTF8(uidna,
input, length,
**buf, buf->capacity(),
&info,
&status);
// Do not check info.errors like we do with ToASCII since ToUnicode always
// returns a string, despite any possible errors that may have occurred.
if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
buf->AllocateSufficientStorage(len);
len = uidna_nameToUnicodeUTF8(uidna,
input, length,
**buf, buf->capacity(),
&info,
&status);
}
// info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
// string, regardless of whether an error occurred.
if (U_FAILURE(status)) {
len = -1;
buf->SetLength(0);
} else {
buf->SetLength(len);
}
uidna_close(uidna);
return len;
}
int32_t ToASCII(MaybeStackBuffer<char>* buf,
const char* input,
size_t length,
idna_mode mode) {
UErrorCode status = U_ZERO_ERROR;
uint32_t options = // CheckHyphens = false; handled later
UIDNA_CHECK_BIDI | // CheckBidi = true
UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true
UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing
if (mode == idna_mode::kStrict) {
options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict
// VerifyDnsLength = beStrict;
// handled later
}
UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status))
return -1;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;
int32_t len = uidna_nameToASCII_UTF8(uidna,
input, length,
**buf, buf->capacity(),
&info,
&status);
if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
buf->AllocateSufficientStorage(len);
len = uidna_nameToASCII_UTF8(uidna,
input, length,
**buf, buf->capacity(),
&info,
&status);
}
// In UTS #46 which specifies ToASCII, certain error conditions are
// configurable through options, and the WHATWG URL Standard promptly elects
// to disable some of them to accommodate for real-world use cases.
// Unfortunately, ICU4C's IDNA module does not support disabling some of
// these options through `options` above, and thus continues throwing
// unnecessary errors. To counter this situation, we just filter out the
// errors that may have happened afterwards, before deciding whether to
// return an error from this function.
// CheckHyphens = false
// (Specified in the current UTS #46 draft rev. 18.)
// Refs:
// - https://github.com/whatwg/url/issues/53
// - https://github.com/whatwg/url/pull/309
// - http://www.unicode.org/review/pri317/
// - http://www.unicode.org/reports/tr46/tr46-18.html
// - https://www.icann.org/news/announcement-2000-01-07-en
info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
if (mode != idna_mode::kStrict) {
// VerifyDnsLength = beStrict
info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
}
if (U_FAILURE(status) || (mode != idna_mode::kLenient && info.errors != 0)) {
len = -1;
buf->SetLength(0);
} else {
buf->SetLength(len);
}
uidna_close(uidna);
return len;
}
static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);
MaybeStackBuffer<char> buf;
int32_t len = ToUnicode(&buf, *val, val.length());
if (len < 0) {
return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
}
args.GetReturnValue().Set(
String::NewFromUtf8(env->isolate(),
*buf,
NewStringType::kNormal,
len).ToLocalChecked());
}
static void ToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);
// optional arg
bool lenient = args[1]->BooleanValue(env->isolate());
idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault;
MaybeStackBuffer<char> buf;
int32_t len = ToASCII(&buf, *val, val.length(), mode);
if (len < 0) {
return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
}
args.GetReturnValue().Set(
String::NewFromUtf8(env->isolate(),
*buf,
NewStringType::kNormal,
len).ToLocalChecked());
}
// This is similar to wcwidth except that it takes the current unicode
// character properties database into consideration, allowing it to
// correctly calculate the column widths of things like emoji's and
@ -850,8 +686,6 @@ static void CreatePerIsolateProperties(IsolateData* isolate_data,
Local<ObjectTemplate> target) {
Isolate* isolate = isolate_data->isolate();
SetMethod(isolate, target, "toUnicode", ToUnicode);
SetMethod(isolate, target, "toASCII", ToASCII);
SetMethod(isolate, target, "getStringWidth", GetStringWidth);
// One-shot converters
@ -880,8 +714,6 @@ void CreatePerContextProperties(Local<Object> target,
void* priv) {}
void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
registry->Register(ToUnicode);
registry->Register(ToASCII);
registry->Register(GetStringWidth);
registry->Register(ICUErrorName);
registry->Register(Transcode);

View File

@ -53,19 +53,6 @@ enum class idna_mode {
kStrict
};
// Implements the WHATWG URL Standard "domain to ASCII" algorithm.
// https://url.spec.whatwg.org/#concept-domain-to-ascii
int32_t ToASCII(MaybeStackBuffer<char>* buf,
const char* input,
size_t length,
idna_mode mode = idna_mode::kDefault);
// Implements the WHATWG URL Standard "domain to Unicode" algorithm.
// https://url.spec.whatwg.org/#concept-domain-to-unicode
int32_t ToUnicode(MaybeStackBuffer<char>* buf,
const char* input,
size_t length);
struct ConverterDeleter {
void operator()(UConverter* pointer) const { ucnv_close(pointer); }
};

View File

@ -1,149 +0,0 @@
[
"This resource is focused on highlighting issues with UTS #46 ToASCII",
{
"comment": "Label with hyphens in 3rd and 4th position",
"input": "aa--",
"output": "aa--"
},
{
"input": "a†--",
"output": "xn--a---kp0a"
},
{
"input": "ab--c",
"output": "ab--c"
},
{
"comment": "Label with leading hyphen",
"input": "-x",
"output": "-x"
},
{
"input": "-†",
"output": "xn----xhn"
},
{
"input": "-x.xn--nxa",
"output": "-x.xn--nxa"
},
{
"input": "-x.β",
"output": "-x.xn--nxa"
},
{
"comment": "Label with trailing hyphen",
"input": "x-.xn--nxa",
"output": "x-.xn--nxa"
},
{
"input": "x-.β",
"output": "x-.xn--nxa"
},
{
"comment": "Empty labels",
"input": "x..xn--nxa",
"output": "x..xn--nxa"
},
{
"input": "x..β",
"output": "x..xn--nxa"
},
{
"comment": "Invalid Punycode",
"input": "xn--a",
"output": null
},
{
"input": "xn--a.xn--nxa",
"output": null
},
{
"input": "xn--a.β",
"output": null
},
{
"comment": "Valid Punycode",
"input": "xn--nxa.xn--nxa",
"output": "xn--nxa.xn--nxa"
},
{
"comment": "Mixed",
"input": "xn--nxa.β",
"output": "xn--nxa.xn--nxa"
},
{
"input": "ab--c.xn--nxa",
"output": "ab--c.xn--nxa"
},
{
"input": "ab--c.β",
"output": "ab--c.xn--nxa"
},
{
"comment": "CheckJoiners is true",
"input": "\u200D.example",
"output": null
},
{
"input": "xn--1ug.example",
"output": null
},
{
"comment": "CheckBidi is true",
"input": "يa",
"output": null
},
{
"input": "xn--a-yoc",
"output": null
},
{
"comment": "processing_option is Nontransitional_Processing",
"input": "ශ්‍රී",
"output": "xn--10cl1a0b660p"
},
{
"input": "نامه‌ای",
"output": "xn--mgba3gch31f060k"
},
{
"comment": "U+FFFD",
"input": "\uFFFD.com",
"output": null
},
{
"comment": "U+FFFD character encoded in Punycode",
"input": "xn--zn7c.com",
"output": null
},
{
"comment": "Label longer than 63 code points",
"input": "x01234567890123456789012345678901234567890123456789012345678901x",
"output": "x01234567890123456789012345678901234567890123456789012345678901x"
},
{
"input": "x01234567890123456789012345678901234567890123456789012345678901†",
"output": "xn--x01234567890123456789012345678901234567890123456789012345678901-6963b"
},
{
"input": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa",
"output": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa"
},
{
"input": "x01234567890123456789012345678901234567890123456789012345678901x.β",
"output": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa"
},
{
"comment": "Domain excluding TLD longer than 253 code points",
"input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.x",
"output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.x"
},
{
"input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa",
"output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa"
},
{
"input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.β",
"output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa"
}
]

View File

@ -1,57 +0,0 @@
'use strict';
// Flags: --expose-internals
const common = require('../common');
if (!common.hasIntl)
common.skip('missing Intl');
const { internalBinding } = require('internal/test/binding');
const icu = internalBinding('icu');
const assert = require('assert');
// Test hasConverter method
assert(icu.hasConverter('utf-8'),
'hasConverter should report converter exists for utf-8');
assert(!icu.hasConverter('x'),
'hasConverter should report converter does not exist for x');
const tests = require('../fixtures/url-idna.js');
const fixtures = require('../fixtures/icu-punycode-toascii.json');
{
for (const [i, { ascii, unicode }] of tests.entries()) {
assert.strictEqual(ascii, icu.toASCII(unicode), `toASCII(${i + 1})`);
assert.strictEqual(unicode, icu.toUnicode(ascii), `toUnicode(${i + 1})`);
assert.strictEqual(ascii, icu.toASCII(icu.toUnicode(ascii)),
`toASCII(toUnicode(${i + 1}))`);
assert.strictEqual(unicode, icu.toUnicode(icu.toASCII(unicode)),
`toUnicode(toASCII(${i + 1}))`);
}
}
{
for (const [i, test] of fixtures.entries()) {
if (typeof test === 'string')
continue; // skip comments
const { comment, input, output } = test;
let caseComment = `case ${i + 1}`;
if (comment)
caseComment += ` (${comment})`;
if (output === null) {
assert.throws(
() => icu.toASCII(input),
{
code: 'ERR_INVALID_ARG_VALUE',
name: 'TypeError',
message: 'Cannot convert name to ASCII'
}
);
icu.toASCII(input, true); // Should not throw.
} else {
assert.strictEqual(icu.toASCII(input), output, `ToASCII ${caseComment}`);
assert.strictEqual(icu.toASCII(input, true), output,
`ToASCII ${caseComment} in lenient mode`);
}
icu.toUnicode(input); // Should not throw.
}
}