// icu4c/source/i18n/regexst.cpp - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 //
 //  regexst.cpp
 //
 //  Copyright (C) 2004-2015, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains class RegexStaticSets
 //
 //  This class is internal to the regular expression implementation.
 //  For the public Regular Expression API, see the file "unicode/regex.h"
 //
 //  RegexStaticSets groups together the common UnicodeSets that are needed
 //   for compiling or executing RegularExpressions.  This grouping simplifies
 //   the thread safe lazy creation and sharing of these sets across
 //   all instances of regular expressions.
 //
 #include "unicode/utypes.h"

 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

 #include "unicode/unistr.h"
 #include "unicode/uniset.h"
 #include "unicode/uchar.h"
 #include "unicode/regex.h"
 #include "uprops.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "uassert.h"
 #include "ucln_in.h"
 #include "umutex.h"

 #include "regexcst.h"   // Contains state table for the regex pattern parser.
                         //   generated by a Perl script.
 #include "regexst.h"

 U_NAMESPACE_BEGIN

 // "Rule Char" Characters are those with special meaning, and therefore
 //    need to be escaped to appear as literals in a regexp.
 constexpr const char16_t *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";

 // Letters that may follow a backslash and are then resolved by ICU's
 // unescape() function.
 constexpr const char16_t *gUnescapeChars = u"acefnrtuUx";

 // UnicodeSet pattern backing the regular expression \w (word characters).
 constexpr const char16_t *gIsWordPattern =
         u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";

 // UnicodeSet pattern backing the regular expression \s (white space).
 constexpr const char16_t *gIsSpacePattern = u"[\\p{WhiteSpace}]";

 // UnicodeSet patterns used to detect grapheme clusters for \X.
 // Control/Extend first, then the Hangul syllable types L, V, T, LV and LVT.
 constexpr const char16_t *gGC_ControlPattern =
         u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
 constexpr const char16_t *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]";
 constexpr const char16_t *gGC_LPattern      = u"[\\p{Hangul_Syllable_Type=L}]";
 constexpr const char16_t *gGC_VPattern      = u"[\\p{Hangul_Syllable_Type=V}]";
 constexpr const char16_t *gGC_TPattern      = u"[\\p{Hangul_Syllable_Type=T}]";
 constexpr const char16_t *gGC_LVPattern     = u"[\\p{Hangul_Syllable_Type=LV}]";
 constexpr const char16_t *gGC_LVTPattern    = u"[\\p{Hangul_Syllable_Type=LVT}]";


 // The single shared RegexStaticSets instance. It starts out null, is created
 // by initStaticSets() and is deleted and reset to null by regex_cleanup().
 RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
 // Init-once state handed to umtx_initOnce() by initGlobals(); regex_cleanup()
 // calls reset() on it after deleting the shared instance.
 UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;


 // Builds every shared set used while compiling and executing regular
 // expressions: the unescape set, the property sets (and their parallel
 // 8-bit fast sets), the rule-parsing sets, and an empty utility UText.
 //
 // @param status  In/out ICU error code. It is dereferenced without a null
 //                check, handed to every applyPattern() call and passed on to
 //                utext_openUChars(). initStaticSets() inspects it after
 //                construction and discards the object on failure.
 RegexStaticSets::RegexStaticSets(UErrorCode *status) {
     // Initialize the shared static sets to their correct values.
     // Every pattern constant is wrapped with the (isTerminated, text, length)
     // UnicodeString constructor, passing -1 as the length. The boolean
     // argument is spelled `true` throughout rather than mixing in the
     // legacy TRUE macro.
     fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
     fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
     fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
     fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(true, gGC_ExtendPattern, -1), *status).freeze();
     fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(true, gGC_ControlPattern, -1), *status).freeze();
     fPropSets[URX_GC_L].applyPattern(UnicodeString(true, gGC_LPattern, -1), *status).freeze();
     fPropSets[URX_GC_V].applyPattern(UnicodeString(true, gGC_VPattern, -1), *status).freeze();
     fPropSets[URX_GC_T].applyPattern(UnicodeString(true, gGC_TPattern, -1), *status).freeze();
     fPropSets[URX_GC_LV].applyPattern(UnicodeString(true, gGC_LVPattern, -1), *status).freeze();
     fPropSets[URX_GC_LVT].applyPattern(UnicodeString(true, gGC_LVTPattern, -1), *status).freeze();


     //
     //  "Normal" is the set of characters that don't need special handling
     //            when finding grapheme cluster boundaries.
     //
     //  It is built by complementing the set and then removing a fixed code
     //  point range plus the Control, L, V and T sets.
     //  NOTE(review): the Hangul Syllables block is U+AC00..U+D7A3; the upper
     //  bound used here is U+D7A4 - confirm that this is intended.
     //
     fPropSets[URX_GC_NORMAL].complement();
     fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
     fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
     fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
     fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
     fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
     fPropSets[URX_GC_NORMAL].freeze();

     // Initialize the 8-bit fast bit sets from the parallel full
     //   UnicodeSets.
     //
     // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
     //       Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
     //       This runs in exponential time, making it easy to adjust the time for
     //       convenient measuring.
     //
     //       This 8 bit optimization dates from the early days of ICU,
     //       with a less optimized UnicodeSet. At the time, the difference
     //       was substantial.

     for (int32_t i=0; i<URX_LAST_SET; i++) {
         fPropSets8[i].init(&fPropSets[i]);
     }

     // Sets used while parsing rules, but not referenced from the parse state table
     // The rule_char set is the complement of the metacharacter list, i.e. it
     // holds the characters that are not in gRuleSet_rule_chars.
     fRuleSets[kRuleSet_rule_char-128]
             .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze();

     fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
     fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
     // Non-owning alias into fRuleSets; the destructor only nulls it.
     fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];

     // Finally, initialize an empty UText string for utility purposes
     fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);

 }


 // Releases what the constructor acquired: the utility UText is closed and
 // the digit-set alias is cleared.
 RegexStaticSets::~RegexStaticSets() {
     // Close the empty UText opened with utext_openUChars() in the constructor.
     utext_close(fEmptyText);
     // The alias merely points into fRuleSets, so it is dropped, not freed.
     fRuleDigitsAlias = nullptr;
 }


 //------------------------------------------------------------------------------
 //
 //   regex_cleanup      Memory cleanup function, free/delete all
 //                      cached memory.  Called by ICU's u_cleanup() function.
 //
 //------------------------------------------------------------------------------

 U_CDECL_BEGIN
 // Cleanup callback registered by initStaticSets() through
 // ucln_i18n_registerCleanup(). Deletes the shared RegexStaticSets instance,
 // nulls the pointer and resets the init-once state.
 //
 // @return always true (spelled with the C++ literal, as elsewhere in this
 //         file, instead of the legacy TRUE macro).
 static UBool U_CALLCONV
 regex_cleanup(void) {
     delete RegexStaticSets::gStaticSets;
     RegexStaticSets::gStaticSets = nullptr;
     gStaticSetsInitOnce.reset();
     return true;
 }

 // Initializer passed to umtx_initOnce() by RegexStaticSets::initGlobals().
 // Registers regex_cleanup and then creates the shared RegexStaticSets.
 //
 // @param status  In/out ICU error code. On return it is a failure code when
 //                either the allocation or the construction failed, and in
 //                both cases gStaticSets is null.
 static void U_CALLCONV initStaticSets(UErrorCode &status) {
     U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
     ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);

     RegexStaticSets::gStaticSets = new RegexStaticSets(&status);

     // Allocation produced nothing: report it unless an error is already set.
     if (RegexStaticSets::gStaticSets == nullptr) {
         if (U_SUCCESS(status)) {
             status = U_MEMORY_ALLOCATION_ERROR;
         }
         return;
     }

     // The object exists but its constructor reported an error: discard it.
     if (U_FAILURE(status)) {
         delete RegexStaticSets::gStaticSets;
         RegexStaticSets::gStaticSets = nullptr;
     }
 }
 U_CDECL_END

 // Makes sure the shared RegexStaticSets instance has been set up by routing
 // initStaticSets() through umtx_initOnce() with gStaticSetsInitOnce.
 //
 // @param status  In/out ICU error code. It is dereferenced unconditionally,
 //                so it must not be null. When initStaticSets() fails it
 //                leaves a failure code here and gStaticSets stays null.
 void RegexStaticSets::initGlobals(UErrorCode *status) {
     umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status);
 }

 U_NAMESPACE_END
 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	//
	// regexst.cpp
	//
	// Copyright (C) 2004-2015, International Business Machines Corporation and others.
	// All Rights Reserved.
	//
	// This file contains class RegexStaticSets
	//
	// This class is internal to the regular expression implementation.
	// For the public Regular Expression API, see the file "unicode/regex.h"
	//
	// RegexStaticSets groups together the common UnicodeSets that are needed
	// for compiling or executing RegularExpressions. This grouping simplifies
	// the thread safe lazy creation and sharing of these sets across
	// all instances of regular expressions.
	//
	#include "unicode/utypes.h"

	#if !UCONFIG_NO_REGULAR_EXPRESSIONS

	#include "unicode/unistr.h"
	#include "unicode/uniset.h"
	#include "unicode/uchar.h"
	#include "unicode/regex.h"
	#include "uprops.h"
	#include "cmemory.h"
	#include "cstring.h"
	#include "uassert.h"
	#include "ucln_in.h"
	#include "umutex.h"

	#include "regexcst.h" // Contains state table for the regex pattern parser.
	// generated by a Perl script.
	#include "regexst.h"

	U_NAMESPACE_BEGIN

	// "Rule Char" Characters are those with special meaning, and therefore
	// need to be escaped to appear as literals in a regexp.
	// Pointer to a NUL-terminated UTF-16 literal listing the metacharacters.
	// The list starts with '*', and '|' needs no backslash inside a C++ string.
	constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";

	//
	// The backslash escape characters that ICU's unescape() function will handle.
	//
	constexpr const char16_t *gUnescapeChars = u"acefnrtuUx";

	// UnicodeSet pattern backing the regular expression \w (word characters).
	constexpr const char16_t *gIsWordPattern =
	        u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";

	// UnicodeSet pattern backing the regular expression \s (white space).
	constexpr const char16_t *gIsSpacePattern = u"[\\p{WhiteSpace}]";

	// UnicodeSet patterns used to detect grapheme clusters for \X.
	// Control/Extend first, then the Hangul syllable types L, V, T, LV and LVT.
	constexpr const char16_t *gGC_ControlPattern =
	        u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
	constexpr const char16_t *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]";
	constexpr const char16_t *gGC_LPattern      = u"[\\p{Hangul_Syllable_Type=L}]";
	constexpr const char16_t *gGC_VPattern      = u"[\\p{Hangul_Syllable_Type=V}]";
	constexpr const char16_t *gGC_TPattern      = u"[\\p{Hangul_Syllable_Type=T}]";
	constexpr const char16_t *gGC_LVPattern     = u"[\\p{Hangul_Syllable_Type=LV}]";
	constexpr const char16_t *gGC_LVTPattern    = u"[\\p{Hangul_Syllable_Type=LVT}]";


	// The single shared RegexStaticSets instance. It starts out null, is created
	// by initStaticSets() and is deleted and reset to null by regex_cleanup().
	RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
	// Init-once state handed to umtx_initOnce() by initGlobals(); regex_cleanup()
	// calls reset() on it after deleting the shared instance.
	UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;


	// Builds every shared set used while compiling and executing regular
	// expressions: the unescape set, the property sets (and their parallel
	// 8-bit fast sets), the rule-parsing sets, and an empty utility UText.
	//
	// @param status  In/out ICU error code. It is dereferenced without a null
	//                check, handed to every applyPattern() call and passed on to
	//                utext_openUChars(). initStaticSets() inspects it after
	//                construction and discards the object on failure.
	RegexStaticSets::RegexStaticSets(UErrorCode *status) {
	    // Initialize the shared static sets to their correct values.
	    // Every pattern constant is wrapped with the (isTerminated, text, length)
	    // UnicodeString constructor, passing -1 as the length. The boolean
	    // argument is spelled `true` throughout rather than mixing in the
	    // legacy TRUE macro.
	    fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
	    fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
	    fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
	    fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(true, gGC_ExtendPattern, -1), *status).freeze();
	    fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(true, gGC_ControlPattern, -1), *status).freeze();
	    fPropSets[URX_GC_L].applyPattern(UnicodeString(true, gGC_LPattern, -1), *status).freeze();
	    fPropSets[URX_GC_V].applyPattern(UnicodeString(true, gGC_VPattern, -1), *status).freeze();
	    fPropSets[URX_GC_T].applyPattern(UnicodeString(true, gGC_TPattern, -1), *status).freeze();
	    fPropSets[URX_GC_LV].applyPattern(UnicodeString(true, gGC_LVPattern, -1), *status).freeze();
	    fPropSets[URX_GC_LVT].applyPattern(UnicodeString(true, gGC_LVTPattern, -1), *status).freeze();


	    //
	    // "Normal" is the set of characters that don't need special handling
	    // when finding grapheme cluster boundaries.
	    //
	    // It is built by complementing the set and then removing a fixed code
	    // point range plus the Control, L, V and T sets.
	    // NOTE(review): the Hangul Syllables block is U+AC00..U+D7A3; the upper
	    // bound used here is U+D7A4 - confirm that this is intended.
	    //
	    fPropSets[URX_GC_NORMAL].complement();
	    fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
	    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
	    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
	    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
	    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
	    fPropSets[URX_GC_NORMAL].freeze();

	    // Initialize the 8-bit fast bit sets from the parallel full
	    // UnicodeSets.
	    //
	    // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
	    // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
	    // This runs in exponential time, making it easy to adjust the time for
	    // convenient measuring.
	    //
	    // This 8 bit optimization dates from the early days of ICU,
	    // with a less optimized UnicodeSet. At the time, the difference
	    // was substantial.

	    for (int32_t i=0; i<URX_LAST_SET; i++) {
	        fPropSets8[i].init(&fPropSets[i]);
	    }

	    // Sets used while parsing rules, but not referenced from the parse state table
	    // The rule_char set is the complement of the metacharacter list, i.e. it
	    // holds the characters that are not in gRuleSet_rule_chars.
	    fRuleSets[kRuleSet_rule_char-128]
	            .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze();

	    fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
	    fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
	    // Non-owning alias into fRuleSets; the destructor only nulls it.
	    fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];

	    // Finally, initialize an empty UText string for utility purposes
	    fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);

	}


	// Releases what the constructor acquired: the utility UText is closed and
	// the digit-set alias is cleared.
	RegexStaticSets::~RegexStaticSets() {
	    // Close the empty UText opened with utext_openUChars() in the constructor.
	    utext_close(fEmptyText);
	    // The alias merely points into fRuleSets, so it is dropped, not freed.
	    fRuleDigitsAlias = nullptr;
	}


	//------------------------------------------------------------------------------
	//
	// regex_cleanup Memory cleanup function, free/delete all
	// cached memory. Called by ICU's u_cleanup() function.
	//
	//------------------------------------------------------------------------------

	U_CDECL_BEGIN
	// Cleanup callback registered by initStaticSets() through
	// ucln_i18n_registerCleanup(). Deletes the shared RegexStaticSets instance,
	// nulls the pointer and resets the init-once state.
	//
	// @return always true (spelled with the C++ literal, as elsewhere in this
	//         file, instead of the legacy TRUE macro).
	static UBool U_CALLCONV
	regex_cleanup(void) {
	    delete RegexStaticSets::gStaticSets;
	    RegexStaticSets::gStaticSets = nullptr;
	    gStaticSetsInitOnce.reset();
	    return true;
	}

	// Initializer passed to umtx_initOnce() by RegexStaticSets::initGlobals().
	// Registers regex_cleanup and then creates the shared RegexStaticSets.
	//
	// @param status  In/out ICU error code. On return it is a failure code when
	//                either the allocation or the construction failed, and in
	//                both cases gStaticSets is null.
	static void U_CALLCONV initStaticSets(UErrorCode &status) {
	    U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
	    ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);

	    RegexStaticSets::gStaticSets = new RegexStaticSets(&status);

	    // Allocation produced nothing: report it unless an error is already set.
	    if (RegexStaticSets::gStaticSets == nullptr) {
	        if (U_SUCCESS(status)) {
	            status = U_MEMORY_ALLOCATION_ERROR;
	        }
	        return;
	    }

	    // The object exists but its constructor reported an error: discard it.
	    if (U_FAILURE(status)) {
	        delete RegexStaticSets::gStaticSets;
	        RegexStaticSets::gStaticSets = nullptr;
	    }
	}
	U_CDECL_END

	// Makes sure the shared RegexStaticSets instance has been set up by routing
	// initStaticSets() through umtx_initOnce() with gStaticSetsInitOnce.
	//
	// @param status  In/out ICU error code. It is dereferenced unconditionally,
	//                so it must not be null. When initStaticSets() fails it
	//                leaves a failure code here and gStaticSets stays null.
	void RegexStaticSets::initGlobals(UErrorCode *status) {
	umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status);
	}

	U_NAMESPACE_END
	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS