diff --git a/third_party/blink/renderer/platform/BUILD.gn b/third_party/blink/renderer/platform/BUILD.gn index 2c079c06c5..34a844659b 100644 --- a/third_party/blink/renderer/platform/BUILD.gn +++ b/third_party/blink/renderer/platform/BUILD.gn @@ -118,10 +118,7 @@ blink_python_runner("color_data") { compiled_action("character_data") { tool = ":character_data_generator" - outputs = [ - "$blink_platform_output_dir/character_property_data.cc", - "$blink_platform_output_dir/text/break_iterator_data_inline_header.h", - ] + outputs = [ "$blink_platform_output_dir/character_property_data.cc" ] args = rebase_path(outputs, root_build_dir) } diff --git a/third_party/blink/renderer/platform/runtime_enabled_features.json5 b/third_party/blink/renderer/platform/runtime_enabled_features.json5 index 28d34020f7..45942f4e89 100644 --- a/third_party/blink/renderer/platform/runtime_enabled_features.json5 +++ b/third_party/blink/renderer/platform/runtime_enabled_features.json5 @@ -509,11 +509,6 @@ public: true, status: "experimental", }, - { - // crbug.com/41485013 - name: "BreakIteratorDataGenerator", - status: "stable", - }, { name: "BrowserVerifiedUserActivationKeyboard", base_feature: "none", diff --git a/third_party/blink/renderer/platform/text/character_property_data_generator.cc b/third_party/blink/renderer/platform/text/character_property_data_generator.cc index 70101dbf76..27e6852647 100644 --- a/third_party/blink/renderer/platform/text/character_property_data_generator.cc +++ b/third_party/blink/renderer/platform/text/character_property_data_generator.cc @@ -5,19 +5,13 @@ #include "third_party/blink/renderer/platform/text/character_property_data.h" #include -#include -#include #include -#include -#include #include #include #include #include #include -#include -#include #include #include @@ -31,36 +25,6 @@ namespace blink { namespace { -#define CHECK_U_ERROR(error, name) \ - CHECK(U_SUCCESS(error)) << name << ": (" << error << ")" << u_errorName(error) - -// -// Load the ICU data file and set it to the ICU. -// -void InitializeIcu(const char* exec_path) { - // ICU can't load the data file by itself because ICU tries to load the - // versioned data file (e.g., "icudt73l.dat"), while the Chromium build system - // creates the unversioned data file (e.g., "icudtl.dat"). - std::filesystem::path path{exec_path}; - path = path.parent_path() / "icudt" U_ICUDATA_TYPE_LETTER ".dat"; - - std::ifstream data_ifstream(path, std::ios_base::binary); - CHECK(data_ifstream.is_open()); - static std::vector icu_data; - CHECK(icu_data.empty()); - std::copy(std::istreambuf_iterator(data_ifstream), - std::istreambuf_iterator(), std::back_inserter(icu_data)); - UErrorCode error = U_ZERO_ERROR; - udata_setCommonData(icu_data.data(), &error); - CHECK_U_ERROR(error, "udata_setCommonData"); - - // Check ICU functions that need the data resources are working. - // https://unicode-org.github.io/icu/userguide/icu/design.html#icu4c-initialization-and-termination - UVersionInfo version; - ulocdata_getCLDRVersion(version, &error); - CHECK_U_ERROR(error, "ulocdata_getCLDRVersion"); -} - class CharacterPropertyValues { public: constexpr static UChar32 kMaxCodepoint = 0x10FFFF; @@ -255,201 +219,18 @@ static void GenerateCharacterPropertyData(FILE* fp) { GenerateUTrieSerialized(fp, serialized_size, serialized); } -// -// Generate a line break pair table in `break_iterator_data_inline_header.h`. -// -// See [UAX14](https://unicode.org/reports/tr14/). -// -class LineBreakData { - public: - LineBreakData() = default; - - static void Generate(FILE* fp) { - LineBreakData data; - data.FillFromIcu(); - data.FillAscii(); - data.Print(fp); - } - - private: - // Fill the pair table from the ICU BreakIterator. - void FillFromIcu() { - UErrorCode status = U_ZERO_ERROR; - const icu::Locale locale("en"); - icu::BreakIterator* break_iterator = - icu::BreakIterator::createLineInstance(locale, status); - CHECK_U_ERROR(status, "createLineInstance"); - - for (UChar ch = kMinChar; ch <= kMaxChar; ++ch) { - const icu::UnicodeString ch_str(ch); - for (UChar ch_next = kMinChar; ch_next <= kMaxChar; ++ch_next) { - const icu::UnicodeString ch_next_str(ch_next); - const icu::UnicodeString str = ch_str + ch_next_str; - break_iterator->setText(str); - SetPairValue(ch, ch_next, break_iterator->isBoundary(1)); - } - } - } - - // Line breaking table for printable ASCII characters. Line breaking - // opportunities in this table are as below: - // - before opening punctuations such as '(', '<', '[', '{' after certain - // characters (compatible with Firefox 3.6); - // - after '-' and '?' (backward-compatible, and compatible with Internet - // Explorer). - // Please refer to for line - // breaking matrixes of different browsers and the ICU standard. - void FillAscii() { -#define ALL_CHAR '!', 0x7F - SetPairValue(ALL_CHAR, ALL_CHAR, false); - SetPairValue(ALL_CHAR, '(', '(', true); - SetPairValue(ALL_CHAR, '<', '<', true); - SetPairValue(ALL_CHAR, '[', '[', true); - SetPairValue(ALL_CHAR, '{', '{', true); - SetPairValue('-', '-', ALL_CHAR, true); - SetPairValue('?', '?', ALL_CHAR, true); - SetPairValue('-', '-', '$', '$', false); - SetPairValue(ALL_CHAR, '!', '!', false); - SetPairValue('?', '?', '"', '"', false); - SetPairValue('?', '?', '\'', '\'', false); - SetPairValue(ALL_CHAR, ')', ')', false); - SetPairValue(ALL_CHAR, ',', ',', false); - SetPairValue(ALL_CHAR, '.', '.', false); - SetPairValue(ALL_CHAR, '/', '/', false); - // Note: Between '-' and '[0-9]' is hard-coded in `ShouldBreakFast()`. - SetPairValue('-', '-', '0', '9', false); - SetPairValue(ALL_CHAR, ':', ':', false); - SetPairValue(ALL_CHAR, ';', ';', false); - SetPairValue(ALL_CHAR, '?', '?', false); - SetPairValue(ALL_CHAR, ']', ']', false); - SetPairValue(ALL_CHAR, '}', '}', false); - SetPairValue('$', '$', ALL_CHAR, false); - SetPairValue('\'', '\'', ALL_CHAR, false); - SetPairValue('(', '(', ALL_CHAR, false); - SetPairValue('/', '/', ALL_CHAR, false); - SetPairValue('0', '9', ALL_CHAR, false); - SetPairValue('<', '<', ALL_CHAR, false); - SetPairValue('@', '@', ALL_CHAR, false); - SetPairValue('A', 'Z', ALL_CHAR, false); - SetPairValue('[', '[', ALL_CHAR, false); - SetPairValue('^', '`', ALL_CHAR, false); - SetPairValue('a', 'z', ALL_CHAR, false); - SetPairValue('{', '{', ALL_CHAR, false); - SetPairValue(0x7F, 0x7F, ALL_CHAR, false); -#undef ALL_CHAR - } - - // Print the C++ source code. - void Print(FILE* fp) { - // Print file headers. - fprintf(fp, - "#include \n" - "#include " - "\"third_party/blink/renderer/platform/wtf/text/wtf_uchar.h\"\n" - "\nnamespace {\n\n"); - - fprintf(fp, "constexpr UChar kFastLineBreakMinChar = 0x%02X;\n", kMinChar); - fprintf(fp, "constexpr UChar kFastLineBreakMaxChar = 0x%02X;\n", kMaxChar); - - // Define macros. - fprintf(fp, - "\n#define B(a, b, c, d, e, f, g, h)" - " ((a) | ((b) << 1) | ((c) << 2) | ((d) << 3) |" - " ((e) << 4) | ((f) << 5) | ((g) << 6) | ((h) << 7))\n\n"); - - fprintf(fp, "const uint8_t kFastLineBreakTable[%d][%d] = {\n", kNumChars, - kNumCharsRoundUp8 / 8); - - // Print the column comment. - fprintf(fp, " /*"); - for (UChar ch = kMinChar; ch <= kMaxChar; ++ch) { - if (ch != kMinChar && (ch - kMinChar) % 8 == 0) { - fprintf(fp, " "); - } - fprintf(fp, ch < 0x7F ? " %c" : "%02X", ch); - } - fprintf(fp, " */\n"); - - // Print the data array. - for (int y = 0; y < kNumChars; ++y) { - const UChar ch = y + kMinChar; - fprintf(fp, "/* %02X %c */ {B(", ch, ch < 0x7F ? ch : ' '); - const char* prefix = ""; - for (int x = 0; x < kNumCharsRoundUp8; ++x) { - fprintf(fp, "%s%d", prefix, pair_[y][x]); - prefix = (x % 8 == 7) ? "),B(" : ","; - } - fprintf(fp, ")},\n"); - } - fprintf(fp, - "};\n\n" - "#undef B\n\n" - "template \n" - "inline uint8_t GetFastLineBreak(T ch1, T ch2) {\n" - " const T i2 = ch2 - kFastLineBreakMinChar;\n" - " return kFastLineBreakTable[ch1 - kFastLineBreakMinChar]" - "[i2 / 8] & (1 << (i2 %% 8));\n" - "}\n\n" - "} // namespace\n"); - } - - void SetPairValue(UChar ch1_min, - UChar ch1_max, - UChar ch2_min, - UChar ch2_max, - bool value) { - for (UChar ch1 = ch1_min; ch1 <= ch1_max; ++ch1) { - for (UChar ch2 = ch2_min; ch2 <= ch2_max; ++ch2) { - SetPairValue(ch1, ch2, value); - } - } - } - - // Set the breakability between `ch1` and `ch2`. - void SetPairValue(UChar ch1, UChar ch2, bool value) { - CHECK_GE(ch1, kMinChar); - CHECK_LE(ch1, kMaxChar); - CHECK_GE(ch2, kMinChar); - CHECK_LE(ch2, kMaxChar); - pair_[ch1 - kMinChar][ch2 - kMinChar] = value; - } - - constexpr static UChar kMinChar = '!'; - constexpr static UChar kMaxChar = 0xFF; - constexpr static int kNumChars = kMaxChar - kMinChar + 1; - constexpr static int kNumCharsRoundUp8 = (kNumChars + 7) / 8 * 8; - bool pair_[kNumChars][kNumCharsRoundUp8]{}; -}; - -void InvokeGenerator(int index, - int argc, - char** argv, - void (*generator)(FILE*)) { - if (index >= argc) { - return; - } - const char* path = argv[index]; - if (!*path) { - return; - } - - if (strcmp(path, "-") == 0) { - (*generator)(stdout); - return; - } - - FILE* fp = fopen(path, "wb"); - (*generator)(fp); - fclose(fp); -} - } // namespace } // namespace blink int main(int argc, char** argv) { - blink::InitializeIcu(argv[0]); - blink::InvokeGenerator(1, argc, argv, blink::GenerateCharacterPropertyData); - blink::InvokeGenerator(2, argc, argv, blink::LineBreakData::Generate); + // Write the serialized array to the source file. + if (argc <= 1) { + blink::GenerateCharacterPropertyData(stdout); + } else { + FILE* fp = fopen(argv[1], "wb"); + blink::GenerateCharacterPropertyData(fp); + fclose(fp); + } return 0; } diff --git a/third_party/blink/renderer/platform/text/text_break_iterator.cc b/third_party/blink/renderer/platform/text/text_break_iterator.cc index 650eb760b2..14de47d9fe 100644 --- a/third_party/blink/renderer/platform/text/text_break_iterator.cc +++ b/third_party/blink/renderer/platform/text/text_break_iterator.cc @@ -26,8 +26,6 @@ #include #include -#include "third_party/blink/renderer/platform/runtime_enabled_features.h" -#include "third_party/blink/renderer/platform/text/break_iterator_data_inline_header.h" #include "third_party/blink/renderer/platform/wtf/std_lib_extras.h" #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h" #include "third_party/blink/renderer/platform/wtf/text/character_names.h" @@ -113,9 +111,6 @@ static const UChar kAsciiLineBreakTableLastChar = 127; #define F 0xFF -// Check if the generated table match the `kAsciiLineBreakTable` table. -#define CHECK_ASCII_LINE_BRAEK_TABLE 0 - // Line breaking table for printable ASCII characters. Line breaking // opportunities in this table are as below: // - before opening punctuations such as '(', '<', '[', '{' after certain @@ -166,24 +161,6 @@ static const unsigned char kAsciiLineBreakTable[][(kAsciiLineBreakTableLastChar }; // clang-format on -#if CHECK_ASCII_LINE_BRAEK_TABLE -void CheckAsciiLineBreakTable() { - for (UChar ch2 = kAsciiLineBreakTableFirstChar; - ch2 <= kAsciiLineBreakTableLastChar; ++ch2) { - for (UChar ch1 = kAsciiLineBreakTableFirstChar; - ch1 <= kAsciiLineBreakTableLastChar; ++ch1) { - const UChar i2 = ch2 - kAsciiLineBreakTableFirstChar; - const bool ascii = - kAsciiLineBreakTable[ch1 - kAsciiLineBreakTableFirstChar][i2 / 8] & - (1 << (i2 % 8)); - const bool fast = GetFastLineBreak(ch1, ch2); - CHECK_EQ(ascii, fast) - << String::Format("%02X/%02X (%c/%c)", ch1, ch2, ch1, ch2); - } - } -} -#endif // CHECK_ASCII_LINE_BRAEK_TABLE - #define BA_LB_COUNT U_LB_COUNT // Line breaking table for CSS word-break: break-all. This table differs from // asciiLineBreakTable in: @@ -298,13 +275,7 @@ static inline bool ShouldKeepAfterKeepAll(UChar last_ch, } inline bool NeedsLineBreakIterator(UChar ch) { - if (UNLIKELY(!RuntimeEnabledFeatures::BreakIteratorDataGeneratorEnabled())) { - return ch > kAsciiLineBreakTableLastChar && ch != kNoBreakSpaceCharacter; - } - static_assert(kFastLineBreakMaxChar >= kAsciiLineBreakTableLastChar); - static_assert(kNoBreakSpaceCharacter <= kFastLineBreakMaxChar, - "Include NBSP for the performance."); - return ch > kFastLineBreakMaxChar; + return ch > kAsciiLineBreakTableLastChar && ch != kNoBreakSpaceCharacter; } template @@ -349,61 +320,31 @@ struct LazyLineBreakIterator::Context { last = current; } - bool ShouldBreakFast(bool disable_soft_hyphen) const { -#if CHECK_ASCII_LINE_BRAEK_TABLE - DEFINE_STATIC_LOCAL(bool, is_check_done, (false)); - if (!is_check_done) { - is_check_done = true; - CheckAsciiLineBreakTable(); - LOG(INFO) << "CheckAsciiLineBreakTable() completed."; - } -#endif // CHECK_ASCII_LINE_BRAEK_TABLE - + bool ShouldBreakFast() const { const UChar last_ch = last.ch; const UChar ch = current.ch; - static_assert(kFastLineBreakMinChar == kAsciiLineBreakTableFirstChar); - if (UNLIKELY(last_ch < kFastLineBreakMinChar || - ch < kFastLineBreakMinChar)) { + if (UNLIKELY(last_ch < kAsciiLineBreakTableFirstChar || + ch < kAsciiLineBreakTableFirstChar)) { return false; } // Don't allow line breaking between '-' and a digit if the '-' may mean a // minus sign in the context, while allow breaking in 'ABCD-1234' and // '1234-5678' which may be in long URLs. - static_assert('-' >= kFastLineBreakMinChar); + static_assert('-' >= kAsciiLineBreakTableFirstChar); if (last_ch == '-' && IsASCIIDigit(ch)) { return IsASCIIAlphanumeric(last_last_ch); } - if (UNLIKELY( - !RuntimeEnabledFeatures::BreakIteratorDataGeneratorEnabled())) { - // If both `last_ch` and `ch` are ASCII characters, use a lookup table for - // enhanced speed and for compatibility with other browsers (see comments - // for asciiLineBreakTable for details). - if (last_ch <= kAsciiLineBreakTableLastChar && - ch <= kAsciiLineBreakTableLastChar) { - const unsigned char* table_row = - kAsciiLineBreakTable[last_ch - kAsciiLineBreakTableFirstChar]; - int ch_index = ch - kAsciiLineBreakTableFirstChar; - return table_row[ch_index / 8] & (1 << (ch_index % 8)); - } - - // Otherwise defer to the Unicode algorithm by returning false. - return false; - } - - // If both characters are in the fast line break table, use it for enhanced - // speed. For ASCII characters, it is also for compatibility. The table is - // generated at the build time, see the `LineBreakData` class. - if (last_ch <= kFastLineBreakMaxChar && ch <= kFastLineBreakMaxChar) { - if (!GetFastLineBreak(last_ch, ch)) { - return false; - } - static_assert(kSoftHyphenCharacter <= kFastLineBreakMaxChar); - if (UNLIKELY(disable_soft_hyphen && last_ch == kSoftHyphenCharacter)) { - return false; - } - return true; + // If both `last_ch` and `ch` are ASCII characters, use a lookup table for + // enhanced speed and for compatibility with other browsers (see comments + // for asciiLineBreakTable for details). + if (last_ch <= kAsciiLineBreakTableLastChar && + ch <= kAsciiLineBreakTableLastChar) { + const unsigned char* table_row = + kAsciiLineBreakTable[last_ch - kAsciiLineBreakTableFirstChar]; + int ch_index = ch - kAsciiLineBreakTableFirstChar; + return table_row[ch_index / 8] & (1 << (ch_index % 8)); } // Otherwise defer to the Unicode algorithm by returning false. @@ -452,7 +393,7 @@ inline int LazyLineBreakIterator::NextBreakablePosition( break; } - if (context.ShouldBreakFast(disable_soft_hyphen_)) { + if (context.ShouldBreakFast()) { return i; }