| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| // |
| // regexst.h |
| // |
| // Copyright (C) 2004-2015, International Business Machines Corporation and others. |
| // All Rights Reserved. |
| // |
| // This file contains class RegexStaticSets |
| // |
| // This class is internal to the regular expression implementation. |
| // For the public Regular Expression API, see the file "unicode/regex.h" |
| // |
| // RegexStaticSets groups together the common UnicodeSets that are needed |
| // for compiling or executing RegularExpressions. This grouping simplifies |
| // the thread safe lazy creation and sharing of these sets across |
| // all instances of regular expressions. |
| // |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| |
| #include "unicode/unistr.h" |
| #include "unicode/uniset.h" |
| #include "unicode/uchar.h" |
| #include "unicode/regex.h" |
| #include "uprops.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "uassert.h" |
| #include "ucln_in.h" |
| #include "umutex.h" |
| |
| #include "regexcst.h" // Contains state table for the regex pattern parser. |
| // generated by a Perl script. |
| #include "regexst.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| // "Rule Char" Characters are those with special meaning, and therefore |
| // need to be escaped to appear as literals in a regexp. |
| constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\."; |
| |
| // |
| // The backslash escape characters that ICU's unescape() function will handle. |
| // |
| constexpr char16_t const *gUnescapeChars = u"acefnrtuUx"; |
| |
| // |
| // Unicode Set pattern for Regular Expression \w |
| // |
| constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]"; |
| |
| // |
| // Unicode Set Definitions for Regular Expression \s |
| // |
| constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]"; |
| |
| // |
| // UnicodeSets used in implementation of Grapheme Cluster detection, \X |
| // |
| constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]"; |
| constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]"; |
| constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]"; |
| constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]"; |
| constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]"; |
| constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]"; |
| constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]"; |
| |
| |
| RegexStaticSets *RegexStaticSets::gStaticSets = nullptr; |
| UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER; |
| |
| |
| RegexStaticSets::RegexStaticSets(UErrorCode *status) { |
| // Initialize the shared static sets to their correct values. |
| fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze(); |
| fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze(); |
| fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze(); |
| fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status).freeze(); |
| fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(TRUE, gGC_ControlPattern, -1), *status).freeze(); |
| fPropSets[URX_GC_L].applyPattern(UnicodeString(TRUE, gGC_LPattern, -1), *status).freeze(); |
| fPropSets[URX_GC_V].applyPattern(UnicodeString(TRUE, gGC_VPattern, -1), *status).freeze(); |
| fPropSets[URX_GC_T].applyPattern(UnicodeString(TRUE, gGC_TPattern, -1), *status).freeze(); |
| fPropSets[URX_GC_LV].applyPattern(UnicodeString(TRUE, gGC_LVPattern, -1), *status).freeze(); |
| fPropSets[URX_GC_LVT].applyPattern(UnicodeString(TRUE, gGC_LVTPattern, -1), *status).freeze(); |
| |
| |
| // |
| // "Normal" is the set of characters that don't need special handling |
| // when finding grapheme cluster boundaries. |
| // |
| fPropSets[URX_GC_NORMAL].complement(); |
| fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4); |
| fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]); |
| fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]); |
| fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]); |
| fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]); |
| fPropSets[URX_GC_NORMAL].freeze(); |
| |
| // Initialize the 8-bit fast bit sets from the parallel full |
| // UnicodeSets. |
| // |
| // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? |
| // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" |
| // This runs in exponential time, making it easy to adjust the time for |
| // convenient measuring. |
| // |
| // This 8 bit optimization dates from the early days of ICU, |
| // with a less optimized UnicodeSet. At the time, the difference |
| // was substantial. |
| |
| for (int32_t i=0; i<URX_LAST_SET; i++) { |
| fPropSets8[i].init(&fPropSets[i]); |
| } |
| |
| // Sets used while parsing rules, but not referenced from the parse state table |
| fRuleSets[kRuleSet_rule_char-128] |
| .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze(); |
| |
| fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze(); |
| fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze(); |
| fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128]; |
| |
| // Finally, initialize an empty UText string for utility purposes |
| fEmptyText = utext_openUChars(nullptr, nullptr, 0, status); |
| |
| } |
| |
| |
| RegexStaticSets::~RegexStaticSets() { |
| fRuleDigitsAlias = nullptr; |
| utext_close(fEmptyText); |
| } |
| |
| |
| //------------------------------------------------------------------------------ |
| // |
| // regex_cleanup Memory cleanup function, free/delete all |
| // cached memory. Called by ICU's u_cleanup() function. |
| // |
| //------------------------------------------------------------------------------ |
| |
| U_CDECL_BEGIN |
| static UBool U_CALLCONV |
| regex_cleanup(void) { |
| delete RegexStaticSets::gStaticSets; |
| RegexStaticSets::gStaticSets = nullptr; |
| gStaticSetsInitOnce.reset(); |
| return TRUE; |
| } |
| |
| static void U_CALLCONV initStaticSets(UErrorCode &status) { |
| U_ASSERT(RegexStaticSets::gStaticSets == nullptr); |
| ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup); |
| RegexStaticSets::gStaticSets = new RegexStaticSets(&status); |
| if (U_FAILURE(status)) { |
| delete RegexStaticSets::gStaticSets; |
| RegexStaticSets::gStaticSets = nullptr; |
| } |
| if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| U_CDECL_END |
| |
| void RegexStaticSets::initGlobals(UErrorCode *status) { |
| umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status); |
| } |
| |
| U_NAMESPACE_END |
| #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |