src: remove icu based ToASCII and ToUnicode

PR-URL: https://github.com/nodejs/node/pull/55156 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Matthew Aitken <maitken033380023@gmail.com> Reviewed-By: Daniel Lemire <daniel@lemire.me> Reviewed-By: Richard Lau <rlau@redhat.com>
2025-12-28 16:07:39 +00:00 · 2024-10-18 12:47:50 -04:00 · 2024-10-18 12:47:50 -04:00 · 9f5000e0f2
commit 9f5000e0f2
parent 7c0cc12f0b
4 changed files with 2 additions and 389 deletions
--- a/src/node_i18n.cc
+++ b/src/node_i18n.cc
@ -60,19 +60,17 @@
 #include <unicode/uchar.h>
 #include <unicode/uclean.h>
 #include <unicode/ucnv.h>
-#include <unicode/udata.h>
-#include <unicode/uidna.h>
 #include <unicode/ulocdata.h>
 #include <unicode/urename.h>
-#include <unicode/ustring.h>
 #include <unicode/utf16.h>
-#include <unicode/utf8.h>
 #include <unicode/utypes.h>
 #include <unicode/uvernum.h>
 #include <unicode/uversion.h>
 #include "nbytes.h"

 #ifdef NODE_HAVE_SMALL_ICU
+#include <unicode/udata.h>
+
 /* if this is defined, we have a 'secondary' entry point.
   compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
 #define SMALL_ICUDATA_ENTRY_POINT \
@ -96,7 +94,6 @@ using v8::Int32;
 using v8::Isolate;
 using v8::Local;
 using v8::MaybeLocal;
-using v8::NewStringType;
 using v8::Object;
 using v8::ObjectTemplate;
 using v8::String;
@ -583,167 +580,6 @@ void SetDefaultTimeZone(const char* tzid) {
  CHECK(U_SUCCESS(status));
 }

-int32_t ToUnicode(MaybeStackBuffer<char>* buf,
-                  const char* input,
-                  size_t length) {
-  UErrorCode status = U_ZERO_ERROR;
-  uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
-  UIDNA* uidna = uidna_openUTS46(options, &status);
-  if (U_FAILURE(status))
-    return -1;
-  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
-
-  int32_t len = uidna_nameToUnicodeUTF8(uidna,
-                                        input, length,
-                                        **buf, buf->capacity(),
-                                        &info,
-                                        &status);
-
-  // Do not check info.errors like we do with ToASCII since ToUnicode always
-  // returns a string, despite any possible errors that may have occurred.
-
-  if (status == U_BUFFER_OVERFLOW_ERROR) {
-    status = U_ZERO_ERROR;
-    buf->AllocateSufficientStorage(len);
-    len = uidna_nameToUnicodeUTF8(uidna,
-                                  input, length,
-                                  **buf, buf->capacity(),
-                                  &info,
-                                  &status);
-  }
-
-  // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
-  // string, regardless of whether an error occurred.
-
-  if (U_FAILURE(status)) {
-    len = -1;
-    buf->SetLength(0);
-  } else {
-    buf->SetLength(len);
-  }
-
-  uidna_close(uidna);
-  return len;
-}
-
-int32_t ToASCII(MaybeStackBuffer<char>* buf,
-                const char* input,
-                size_t length,
-                idna_mode mode) {
-  UErrorCode status = U_ZERO_ERROR;
-  uint32_t options =                  // CheckHyphens = false; handled later
-    UIDNA_CHECK_BIDI |                // CheckBidi = true
-    UIDNA_CHECK_CONTEXTJ |            // CheckJoiners = true
-    UIDNA_NONTRANSITIONAL_TO_ASCII;   // Nontransitional_Processing
-  if (mode == idna_mode::kStrict) {
-    options |= UIDNA_USE_STD3_RULES;  // UseSTD3ASCIIRules = beStrict
-                                      // VerifyDnsLength = beStrict;
-                                      //   handled later
-  }
-
-  UIDNA* uidna = uidna_openUTS46(options, &status);
-  if (U_FAILURE(status))
-    return -1;
-  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
-
-  int32_t len = uidna_nameToASCII_UTF8(uidna,
-                                       input, length,
-                                       **buf, buf->capacity(),
-                                       &info,
-                                       &status);
-
-  if (status == U_BUFFER_OVERFLOW_ERROR) {
-    status = U_ZERO_ERROR;
-    buf->AllocateSufficientStorage(len);
-    len = uidna_nameToASCII_UTF8(uidna,
-                                 input, length,
-                                 **buf, buf->capacity(),
-                                 &info,
-                                 &status);
-  }
-
-  // In UTS #46 which specifies ToASCII, certain error conditions are
-  // configurable through options, and the WHATWG URL Standard promptly elects
-  // to disable some of them to accommodate for real-world use cases.
-  // Unfortunately, ICU4C's IDNA module does not support disabling some of
-  // these options through `options` above, and thus continues throwing
-  // unnecessary errors. To counter this situation, we just filter out the
-  // errors that may have happened afterwards, before deciding whether to
-  // return an error from this function.
-
-  // CheckHyphens = false
-  // (Specified in the current UTS #46 draft rev. 18.)
-  // Refs:
-  // - https://github.com/whatwg/url/issues/53
-  // - https://github.com/whatwg/url/pull/309
-  // - http://www.unicode.org/review/pri317/
-  // - http://www.unicode.org/reports/tr46/tr46-18.html
-  // - https://www.icann.org/news/announcement-2000-01-07-en
-  info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
-  info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
-  info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
-
-  if (mode != idna_mode::kStrict) {
-    // VerifyDnsLength = beStrict
-    info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
-    info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
-    info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
-  }
-
-  if (U_FAILURE(status) || (mode != idna_mode::kLenient && info.errors != 0)) {
-    len = -1;
-    buf->SetLength(0);
-  } else {
-    buf->SetLength(len);
-  }
-
-  uidna_close(uidna);
-  return len;
-}
-
-static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
-  Environment* env = Environment::GetCurrent(args);
-  CHECK_GE(args.Length(), 1);
-  CHECK(args[0]->IsString());
-  Utf8Value val(env->isolate(), args[0]);
-
-  MaybeStackBuffer<char> buf;
-  int32_t len = ToUnicode(&buf, *val, val.length());
-
-  if (len < 0) {
-    return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
-  }
-
-  args.GetReturnValue().Set(
-      String::NewFromUtf8(env->isolate(),
-                          *buf,
-                          NewStringType::kNormal,
-                          len).ToLocalChecked());
-}
-
-static void ToASCII(const FunctionCallbackInfo<Value>& args) {
-  Environment* env = Environment::GetCurrent(args);
-  CHECK_GE(args.Length(), 1);
-  CHECK(args[0]->IsString());
-  Utf8Value val(env->isolate(), args[0]);
-  // optional arg
-  bool lenient = args[1]->BooleanValue(env->isolate());
-  idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault;
-
-  MaybeStackBuffer<char> buf;
-  int32_t len = ToASCII(&buf, *val, val.length(), mode);
-
-  if (len < 0) {
-    return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
-  }
-
-  args.GetReturnValue().Set(
-      String::NewFromUtf8(env->isolate(),
-                          *buf,
-                          NewStringType::kNormal,
-                          len).ToLocalChecked());
-}
-
 // This is similar to wcwidth except that it takes the current unicode
 // character properties database into consideration, allowing it to
 // correctly calculate the column widths of things like emoji's and
@ -850,8 +686,6 @@ static void CreatePerIsolateProperties(IsolateData* isolate_data,
                                       Local<ObjectTemplate> target) {
  Isolate* isolate = isolate_data->isolate();

-  SetMethod(isolate, target, "toUnicode", ToUnicode);
-  SetMethod(isolate, target, "toASCII", ToASCII);
  SetMethod(isolate, target, "getStringWidth", GetStringWidth);

  // One-shot converters
@ -880,8 +714,6 @@ void CreatePerContextProperties(Local<Object> target,
                                void* priv) {}

 void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
-  registry->Register(ToUnicode);
-  registry->Register(ToASCII);
  registry->Register(GetStringWidth);
  registry->Register(ICUErrorName);
  registry->Register(Transcode);
--- a/src/node_i18n.h
+++ b/src/node_i18n.h
@ -53,19 +53,6 @@ enum class idna_mode {
  kStrict
 };

-// Implements the WHATWG URL Standard "domain to ASCII" algorithm.
-// https://url.spec.whatwg.org/#concept-domain-to-ascii
-int32_t ToASCII(MaybeStackBuffer<char>* buf,
-                const char* input,
-                size_t length,
-                idna_mode mode = idna_mode::kDefault);
-
-// Implements the WHATWG URL Standard "domain to Unicode" algorithm.
-// https://url.spec.whatwg.org/#concept-domain-to-unicode
-int32_t ToUnicode(MaybeStackBuffer<char>* buf,
-                  const char* input,
-                  size_t length);
-
 struct ConverterDeleter {
  void operator()(UConverter* pointer) const { ucnv_close(pointer); }
 };
--- a/test/fixtures/icu-punycode-toascii.json
+++ b/test/fixtures/icu-punycode-toascii.json
@ -1,149 +0,0 @@
-[
-  "This resource is focused on highlighting issues with UTS #46 ToASCII",
-  {
-    "comment": "Label with hyphens in 3rd and 4th position",
-    "input": "aa--",
-    "output": "aa--"
-  },
-  {
-    "input": "a†--",
-    "output": "xn--a---kp0a"
-  },
-  {
-    "input": "ab--c",
-    "output": "ab--c"
-  },
-  {
-    "comment": "Label with leading hyphen",
-    "input": "-x",
-    "output": "-x"
-  },
-  {
-    "input": "-†",
-    "output": "xn----xhn"
-  },
-  {
-    "input": "-x.xn--nxa",
-    "output": "-x.xn--nxa"
-  },
-  {
-    "input": "-x.β",
-    "output": "-x.xn--nxa"
-  },
-  {
-    "comment": "Label with trailing hyphen",
-    "input": "x-.xn--nxa",
-    "output": "x-.xn--nxa"
-  },
-  {
-    "input": "x-.β",
-    "output": "x-.xn--nxa"
-  },
-  {
-    "comment": "Empty labels",
-    "input": "x..xn--nxa",
-    "output": "x..xn--nxa"
-  },
-  {
-    "input": "x..β",
-    "output": "x..xn--nxa"
-  },
-  {
-    "comment": "Invalid Punycode",
-    "input": "xn--a",
-    "output": null
-  },
-  {
-    "input": "xn--a.xn--nxa",
-    "output": null
-  },
-  {
-    "input": "xn--a.β",
-    "output": null
-  },
-  {
-    "comment": "Valid Punycode",
-    "input": "xn--nxa.xn--nxa",
-    "output": "xn--nxa.xn--nxa"
-  },
-  {
-    "comment": "Mixed",
-    "input": "xn--nxa.β",
-    "output": "xn--nxa.xn--nxa"
-  },
-  {
-    "input": "ab--c.xn--nxa",
-    "output": "ab--c.xn--nxa"
-  },
-  {
-    "input": "ab--c.β",
-    "output": "ab--c.xn--nxa"
-  },
-  {
-    "comment": "CheckJoiners is true",
-    "input": "\u200D.example",
-    "output": null
-  },
-  {
-    "input": "xn--1ug.example",
-    "output": null
-  },
-  {
-    "comment": "CheckBidi is true",
-    "input": "يa",
-    "output": null
-  },
-  {
-    "input": "xn--a-yoc",
-    "output": null
-  },
-  {
-    "comment": "processing_option is Nontransitional_Processing",
-    "input": "ශ්‍රී",
-    "output": "xn--10cl1a0b660p"
-  },
-  {
-    "input": "نامه‌ای",
-    "output": "xn--mgba3gch31f060k"
-  },
-  {
-    "comment": "U+FFFD",
-    "input": "\uFFFD.com",
-    "output": null
-  },
-  {
-    "comment": "U+FFFD character encoded in Punycode",
-    "input": "xn--zn7c.com",
-    "output": null
-  },
-  {
-    "comment": "Label longer than 63 code points",
-    "input": "x01234567890123456789012345678901234567890123456789012345678901x",
-    "output": "x01234567890123456789012345678901234567890123456789012345678901x"
-  },
-  {
-    "input": "x01234567890123456789012345678901234567890123456789012345678901†",
-    "output": "xn--x01234567890123456789012345678901234567890123456789012345678901-6963b"
-  },
-  {
-    "input": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa",
-    "output": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa"
-  },
-  {
-    "input": "x01234567890123456789012345678901234567890123456789012345678901x.β",
-    "output": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa"
-  },
-  {
-    "comment": "Domain excluding TLD longer than 253 code points",
-    "input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.x",
-    "output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.x"
-  },
-  {
-    "input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa",
-    "output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa"
-  },
-  {
-    "input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.β",
-    "output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa"
-  }
-]
--- a/test/parallel/test-icu-punycode.js
+++ b/test/parallel/test-icu-punycode.js
@ -1,57 +0,0 @@
-'use strict';
-// Flags: --expose-internals
-const common = require('../common');
-
-if (!common.hasIntl)
-  common.skip('missing Intl');
-
-const { internalBinding } = require('internal/test/binding');
-const icu = internalBinding('icu');
-const assert = require('assert');
-
-// Test hasConverter method
-assert(icu.hasConverter('utf-8'),
-       'hasConverter should report converter exists for utf-8');
-assert(!icu.hasConverter('x'),
-       'hasConverter should report converter does not exist for x');
-
-const tests = require('../fixtures/url-idna.js');
-const fixtures = require('../fixtures/icu-punycode-toascii.json');
-
-{
-  for (const [i, { ascii, unicode }] of tests.entries()) {
-    assert.strictEqual(ascii, icu.toASCII(unicode), `toASCII(${i + 1})`);
-    assert.strictEqual(unicode, icu.toUnicode(ascii), `toUnicode(${i + 1})`);
-    assert.strictEqual(ascii, icu.toASCII(icu.toUnicode(ascii)),
-                       `toASCII(toUnicode(${i + 1}))`);
-    assert.strictEqual(unicode, icu.toUnicode(icu.toASCII(unicode)),
-                       `toUnicode(toASCII(${i + 1}))`);
-  }
-}
-
-{
-  for (const [i, test] of fixtures.entries()) {
-    if (typeof test === 'string')
-      continue; // skip comments
-    const { comment, input, output } = test;
-    let caseComment = `case ${i + 1}`;
-    if (comment)
-      caseComment += ` (${comment})`;
-    if (output === null) {
-      assert.throws(
-        () => icu.toASCII(input),
-        {
-          code: 'ERR_INVALID_ARG_VALUE',
-          name: 'TypeError',
-          message: 'Cannot convert name to ASCII'
-        }
-      );
-      icu.toASCII(input, true); // Should not throw.
-    } else {
-      assert.strictEqual(icu.toASCII(input), output, `ToASCII ${caseComment}`);
-      assert.strictEqual(icu.toASCII(input, true), output,
-                         `ToASCII ${caseComment} in lenient mode`);
-    }
-    icu.toUnicode(input); // Should not throw.
-  }
-}