ICU-12549 Updating SpoofChecker to latest Unicode specification.
X-SVN-Rev: 39218
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index 1d8c893..fdd7fc3 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -362,7 +362,7 @@
UnicodeSet();
/**
- * Constructs a set containing the given range. If <code>end >
+ * Constructs a set containing the given range. If <code>end <
* start</code> then an empty set is created.
*
* @param start first character, inclusive, of range
diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in
index aab5514..2fe9b3d 100644
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@@ -92,10 +92,10 @@
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o zonemeta.o \
standardplural.o upluralrules.o plurrule.o plurfmt.o selfmt.o dtitvfmt.o dtitvinf.o udateintervalformat.o \
tmunit.o tmutamt.o tmutfmt.o currpinf.o \
-uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o decfmtst.o smpdtfst.o \
+uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o decfmtst.o smpdtfst.o \
ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o ufieldpositer.o \
decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \
-tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o identifier_info.o \
+tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o \
uregion.o reldatefmt.o quantityformatter.o measunit.o \
sharedbreakiterator.o scientificnumberformatter.o digitgrouping.o \
digitinterval.o digitformatter.o digitaffix.o valueformatter.o \
diff --git a/icu4c/source/i18n/i18n.vcxproj b/icu4c/source/i18n/i18n.vcxproj
index 0d0cd6d..869d3a9 100644
--- a/icu4c/source/i18n/i18n.vcxproj
+++ b/icu4c/source/i18n/i18n.vcxproj
@@ -337,7 +337,6 @@
<ClCompile Include="gregocal.cpp" />
<ClCompile Include="gregoimp.cpp" />
<ClCompile Include="hebrwcal.cpp" />
- <ClCompile Include="identifier_info.cpp" />
<ClCompile Include="indiancal.cpp" />
<ClCompile Include="islamcal.cpp" />
<ClCompile Include="japancal.cpp" />
@@ -464,7 +463,6 @@
<ClCompile Include="uspoof_build.cpp" />
<ClCompile Include="uspoof_conf.cpp" />
<ClCompile Include="uspoof_impl.cpp" />
- <ClCompile Include="uspoof_wsconf.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="unicode\alphaindex.h">
@@ -1686,11 +1684,9 @@
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
- <ClInclude Include="identifier_info.h" />
<ClInclude Include="scriptset.h" />
<ClInclude Include="uspoof_conf.h" />
<ClInclude Include="uspoof_impl.h" />
- <ClInclude Include="uspoof_wsconf.h" />
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="i18n.rc" />
diff --git a/icu4c/source/i18n/i18n.vcxproj.filters b/icu4c/source/i18n/i18n.vcxproj.filters
index c897484..57bf52f 100644
--- a/icu4c/source/i18n/i18n.vcxproj.filters
+++ b/icu4c/source/i18n/i18n.vcxproj.filters
@@ -501,9 +501,6 @@
<ClCompile Include="ucsdet.cpp">
<Filter>charset detect</Filter>
</ClCompile>
- <ClCompile Include="identifier_info.cpp">
- <Filter>spoof</Filter>
- </ClCompile>
<ClCompile Include="scriptset.cpp">
<Filter>spoof</Filter>
</ClCompile>
@@ -519,9 +516,6 @@
<ClCompile Include="uspoof_impl.cpp">
<Filter>spoof</Filter>
</ClCompile>
- <ClCompile Include="uspoof_wsconf.cpp">
- <Filter>spoof</Filter>
- </ClCompile>
<ClCompile Include="alphaindex.cpp">
<Filter>collation</Filter>
</ClCompile>
@@ -943,9 +937,6 @@
<ClInclude Include="inputext.h">
<Filter>charset detect</Filter>
</ClInclude>
- <ClInclude Include="identifier_info.h">
- <Filter>spoof</Filter>
- </ClInclude>
<ClInclude Include="scriptset.h">
<Filter>spoof</Filter>
</ClInclude>
@@ -955,9 +946,6 @@
<ClInclude Include="uspoof_impl.h">
<Filter>spoof</Filter>
</ClInclude>
- <ClInclude Include="uspoof_wsconf.h">
- <Filter>spoof</Filter>
- </ClInclude>
<ClInclude Include="tzgnames.h">
<Filter>formatting</Filter>
</ClInclude>
diff --git a/icu4c/source/i18n/scriptset.cpp b/icu4c/source/i18n/scriptset.cpp
index ee1ed70..5a42535 100644
--- a/icu4c/source/i18n/scriptset.cpp
+++ b/icu4c/source/i18n/scriptset.cpp
@@ -193,6 +193,15 @@
return -1;
}
+UBool ScriptSet::isEmpty() const {
+    // OR every word of the bit vector together; the result is zero only if no bit is set.
+    uint32_t anyBits = 0;
+    for (uint32_t word = 0; word < UPRV_LENGTHOF(bits); word++) {
+        anyBits |= bits[word];
+    }
+    return anyBits == 0;
+}
+
UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const {
UBool firstTime = TRUE;
for (int32_t i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) {
@@ -240,6 +249,41 @@
return *this;
}
+// Adds the scripts returned by uscript_getScriptExtensions() for codePoint to this set.
+// Does nothing if status is already a failure; on error, status is set and scripts may be partly added.
+void ScriptSet::setScriptExtensions(UChar32 codePoint, UErrorCode& status) {
+    if (U_FAILURE(status)) { return; }
+    static const int32_t FIRST_GUESS_SCRIPT_CAPACITY = 5;
+    MaybeStackArray<UScriptCode,FIRST_GUESS_SCRIPT_CAPACITY> scripts;
+    UErrorCode internalStatus = U_ZERO_ERROR;
+    int32_t script_count = -1;
+    while (TRUE) {
+        // Use the current capacity: after resize() the first-guess constant would overflow forever.
+        script_count = uscript_getScriptExtensions(
+            codePoint, scripts.getAlias(), scripts.getCapacity(), &internalStatus);
+        if (internalStatus == U_BUFFER_OVERFLOW_ERROR) {
+            // Need to allocate more space; script_count is the size that was asked for.
+            if (scripts.resize(script_count) == NULL) {
+                status = U_MEMORY_ALLOCATION_ERROR;
+                return;
+            }
+            internalStatus = U_ZERO_ERROR;
+        } else {
+            break;
+        }
+    }
+    // Check if we failed for some reason other than buffer overflow
+    if (U_FAILURE(internalStatus)) {
+        status = internalStatus;
+        return;
+    }
+    // Load the scripts into the ScriptSet and return
+    for (int32_t i = 0; i < script_count; i++) {
+        this->set(scripts[i], status);
+        if (U_FAILURE(status)) { return; }
+    }
+}
+
U_NAMESPACE_END
U_CAPI UBool U_EXPORT2
diff --git a/icu4c/source/i18n/scriptset.h b/icu4c/source/i18n/scriptset.h
index cd5b3be..e8de3b9 100644
--- a/icu4c/source/i18n/scriptset.h
+++ b/icu4c/source/i18n/scriptset.h
@@ -58,9 +58,14 @@
int32_t hashCode() const;
int32_t nextSetBit(int32_t script) const;
+ UBool isEmpty() const;
+
UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string.
ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status); // Replaces ScriptSet contents.
+ // Wraps around UScript::getScriptExtensions() and adds the corresponding scripts to this instance.
+ void setScriptExtensions(UChar32 codePoint, UErrorCode& status);
+
private:
uint32_t bits[6];
};
diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h
index b5069e1..b609fce 100644
--- a/icu4c/source/i18n/ucln_in.h
+++ b/icu4c/source/i18n/ucln_in.h
@@ -26,8 +26,8 @@
It's usually best to have child dependencies called first. */
typedef enum ECleanupI18NType {
UCLN_I18N_START = -1,
- UCLN_I18N_IDENTIFIER_INFO,
UCLN_I18N_SPOOF,
+ UCLN_I18N_SPOOFDATA,
UCLN_I18N_TRANSLITERATOR,
UCLN_I18N_REGEX,
UCLN_I18N_ISLAMIC_CALENDAR,
diff --git a/icu4c/source/i18n/unicode/uspoof.h b/icu4c/source/i18n/unicode/uspoof.h
index 2d65fbd..08e85fc 100644
--- a/icu4c/source/i18n/unicode/uspoof.h
+++ b/icu4c/source/i18n/unicode/uspoof.h
@@ -37,123 +37,345 @@
* \file
* \brief Unicode Security and Spoofing Detection, C API.
*
- * These functions are intended to check strings, typically
- * identifiers of some type, such as URLs, for the presence of
- * characters that are likely to be visually confusing -
- * for cases where the displayed form of an identifier may
- * not be what it appears to be.
+ * <p>
+ * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
+ * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
*
- * Unicode Technical Report #36, http://unicode.org/reports/tr36, and
- * Unicode Technical Standard #39, http://unicode.org/reports/tr39
- * "Unicode security considerations", give more background on
- * security an spoofing issues with Unicode identifiers.
- * The tests and checks provided by this module implement the recommendations
- * from those Unicode documents.
+ * <ol>
+ * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desordenado" and
+ * "ԁеѕогԁепаԁо".</li>
+ * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
+ * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
+ * </ol>
*
- * The tests available on identifiers fall into two general categories:
- * -# Single identifier tests. Check whether an identifier is
- * potentially confusable with any other string, or is suspicious
- * for other reasons.
- * -# Two identifier tests. Check whether two specific identifiers are confusable.
- * This does not consider whether either of strings is potentially
- * confusable with any string other than the exact one specified.
+ * <p>
+ * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
+ * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
+ * content filters.
*
- * The steps to perform confusability testing are
- * -# Open a USpoofChecker.
- * -# Configure the USPoofChecker for the desired set of tests. The tests that will
- * be performed are specified by a set of USpoofChecks flags.
- * -# Perform the checks using the pre-configured USpoofChecker. The results indicate
- * which (if any) of the selected tests have identified possible problems with the identifier.
- * Results are reported as a set of USpoofChecks flags; this mirrors the form in which
- * the set of tests to perform was originally specified to the USpoofChecker.
+ * <p>
+ * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++.
*
- * A USpoofChecker may be used repeatedly to perform checks on any number of identifiers.
+ * <h2>Confusables</h2>
*
- * Thread Safety: The test functions for checking a single identifier, or for testing
- * whether two identifiers are possible confusable, are thread safe.
- * They may called concurrently, from multiple threads, using the same USpoofChecker instance.
+ * <p>
+ * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings:
*
- * More generally, the standard ICU thread safety rules apply: functions that take a
- * const USpoofChecker parameter are thread safe. Those that take a non-const
- * USpoofChecier are not thread safe.
+ * \code{.c}
+ * UErrorCode status = U_ZERO_ERROR;
+ * USpoofChecker* sc = uspoof_open(&status);
+ * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
+ * int32_t bitmask = uspoof_areConfusable(sc, (UChar*) u"desordenado", -1, (UChar*) u"ԁеѕогԁепаԁо", -1, &status);
+ * UBool result = (bitmask & USPOOF_ALL_CHECKS) != 0;
+ * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1)
+ * uspoof_close(sc);
+ * \endcode
*
+ * <p>
+ * The second line of the example creates a <code>USpoofChecker</code> object; the third line enables confusable
+ * checking and disables all other checks; the fourth line performs the confusability test; and the fifth line extracts
+ * the result out of the confusability test. For best performance, the instance should be created once (e.g., upon
+ * application startup), and the efficient {@link uspoof_areConfusable} method can be used at runtime.
*
- * Descriptions of the available checks.
+ * <p>
+ * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
+ * {@link uspoof_close} when the object goes out of scope:
*
- * When testing whether pairs of identifiers are confusable, with the uspoof_areConfusable()
- * family of functions, the relevant tests are
+ * \code{.cpp}
+ * UErrorCode status = U_ZERO_ERROR;
+ * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
+ * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status);
+ * // ...
+ * \endcode
*
- * -# USPOOF_SINGLE_SCRIPT_CONFUSABLE: All of the characters from the two identifiers are
- * from a single script, and the two identifiers are visually confusable.
- * -# USPOOF_MIXED_SCRIPT_CONFUSABLE: At least one of the identifiers contains characters
- * from more than one script, and the two identifiers are visually confusable.
- * -# USPOOF_WHOLE_SCRIPT_CONFUSABLE: Each of the two identifiers is of a single script, but
- * the two identifiers are from different scripts, and they are visually confusable.
+ * <p>
+ * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can
+ * be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so
+ * the following snippet is equivalent to the example above:
*
- * The safest approach is to enable all three of these checks as a group.
+ * \code{.c}
+ * UErrorCode status = U_ZERO_ERROR;
+ * UChar* str1 = (UChar*) u"desordenado";
+ * UChar* str2 = (UChar*) u"ԁеѕогԁепаԁо";
*
- * USPOOF_ANY_CASE is a modifier for the above tests. If the identifiers being checked can
- * be of mixed case and are used in a case-sensitive manner, this option should be specified.
+ * USpoofChecker* sc = uspoof_open(&status);
+ * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
*
- * If the identifiers being checked are used in a case-insensitive manner, and if they are
- * displayed to users in lower-case form only, the USPOOF_ANY_CASE option should not be
- * specified. Confusabality issues involving upper case letters will not be reported.
+ * // Get skeleton 1
+ * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
+ * UChar* skel1 = (UChar*) malloc(skel1Len * sizeof(UChar));
+ * status = U_ZERO_ERROR;
+ * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
*
- * When performing tests on a single identifier, with the uspoof_check() family of functions,
- * the relevant tests are:
+ * // Get skeleton 2
+ * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
+ * UChar* skel2 = (UChar*) malloc(skel2Len * sizeof(UChar));
+ * status = U_ZERO_ERROR;
+ * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
*
- * -# USPOOF_MIXED_SCRIPT_CONFUSABLE: the identifier contains characters from multiple
- * scripts, and there exists an identifier of a single script that is visually confusable.
- * -# USPOOF_WHOLE_SCRIPT_CONFUSABLE: the identifier consists of characters from a single
- * script, and there exists a visually confusable identifier.
- * The visually confusable identifier also consists of characters from a single script.
- * but not the same script as the identifier being checked.
- * -# USPOOF_ANY_CASE: modifies the mixed script and whole script confusables tests. If
- * specified, the checks will consider confusable characters of any case. If this flag is not
- * set, the test is performed assuming case folded identifiers.
- * -# USPOOF_SINGLE_SCRIPT: check that the identifier contains only characters from a
- * single script. (Characters from the 'common' and 'inherited' scripts are ignored.)
- * This is not a test for confusable identifiers
- * -# USPOOF_INVISIBLE: check an identifier for the presence of invisible characters,
- * such as zero-width spaces, or character sequences that are
- * likely not to display, such as multiple occurrences of the same
- * non-spacing mark. This check does not test the input string as a whole
- * for conformance to any particular syntax for identifiers.
- * -# USPOOF_CHAR_LIMIT: check that an identifier contains only characters from a specified set
- * of acceptable characters. See uspoof_setAllowedChars() and
- * uspoof_setAllowedLocales().
+ * // Are the skeletons the same?
+ * UBool result = (skel1Len == skel2Len) && memcmp(skel1, skel2, skel1Len * sizeof(UChar)) == 0;
+ * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1)
+ * uspoof_close(sc);
+ * free(skel1);
+ * free(skel2);
+ * \endcode
*
- * Note on Scripts:
- * Characters from the Unicode Scripts "Common" and "Inherited" are ignored when considering
- * the script of an identifier. Common characters include digits and symbols that
- * are normally used with text from more than one script.
+ * <p>
+ * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
+ * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below:
*
- * Identifier Skeletons: A skeleton is a transformation of an identifier, such that
- * all identifiers that are confusable with each other have the same skeleton.
- * Using skeletons, it is possible to build a dictionary data structure for
- * a set of identifiers, and then quickly test whether a new identifier is
- * confusable with an identifier already in the set. The uspoof_getSkeleton()
- * family of functions will produce the skeleton from an identifier.
+ * \code{.c}
+ * // Setup:
+ * UErrorCode status = U_ZERO_ERROR;
+ * UChar* dictionary[2] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
+ * UChar* skeletons[sizeof(dictionary)/sizeof(UChar*)];
+ * int32_t skeletonLengths[sizeof(dictionary)/sizeof(UChar*)];
+ * UChar* str = (UChar*) u"1orern";
*
- * Note that skeletons are not guaranteed to be stable between versions
- * of Unicode or ICU, so an applications should not rely on creating a permanent,
- * or difficult to update, database of skeletons. Instabilities result from
- * identifying new pairs or sequences of characters that are visually
- * confusable, and thus must be mapped to the same skeleton character(s).
+ * // Setup:
+ * USpoofChecker* sc = uspoof_open(&status);
+ * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
+ * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ * UChar* word = dictionary[i];
+ * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
+ * skeletons[i] = (UChar*) malloc(len * sizeof(UChar));
+ * skeletonLengths[i] = len;
+ * status = U_ZERO_ERROR;
+ * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
+ * }
*
- * Skeletons are computed using the algorithm and data describe in Unicode UAX 39.
- * The latest proposed update, UAX 39 Version 8 draft 1, says "the tables SL, SA, and ML
- * were still problematic, and discouraged from use in [Uniocde] 7.0.
- * They were thus removed from version 8.0"
+ * // Live Check:
+ * {
+ * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
+ * UChar* skel = (UChar*) malloc(len * sizeof(UChar));
+ * status = U_ZERO_ERROR;
+ * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
+ * UBool result = FALSE;
+ * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ * if (len == skeletonLengths[i] && memcmp(skel, skeletons[i], len * sizeof(UChar)) == 0) {
+ * result = TRUE;
+ * }
+ * }
+ * // Has confusable in dictionary: 1 (success: 1)
+ * printf("Has confusable in dictionary: %d (success: %d)\n", result, U_SUCCESS(status));
+ * free(skel);
+ * }
*
- * In light of this, the default mapping data included with ICU 55 uses the
- * Unicode 7 MA (Multi script Any case) table data for the other type options
- * (Single Script, Any Case), (Single Script, Lower Case) and (Multi Script, Lower Case).
+ * // Cleanup:
+ * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ * free(skeletons[i]);
+ * }
+ * uspoof_close(sc);
+ * \endcode
+ *
+ * <p>
+ * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
+ * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
+ * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
+ *
+ * <h2>Spoof Detection</h2>
+ *
+ * <p>
+ * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a
+ * string:
+ *
+ * \code{.c}
+ * UErrorCode status = U_ZERO_ERROR;
+ * UChar* str = (UChar*) u"pаypаl"; // with Cyrillic 'а' characters
+ *
+ * // Get the default set of allowable characters:
+ * USet* allowed = uset_openEmpty();
+ * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
+ * uset_addAll(allowed, uspoof_getInclusionSet(&status));
+ *
+ * USpoofChecker* sc = uspoof_open(&status);
+ * uspoof_setAllowedChars(sc, allowed, &status);
+ * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
+ *
+ * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
+ * UBool result = bitmask != 0;
+ * printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status)); // fails checks: 1 (success: 1)
+ * uspoof_close(sc);
+ * uset_close(allowed);
+ * \endcode
+ *
+ * <p>
+ * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at
+ * startup, and call the cheaper {@link uspoof_check} online. We specify the set of
+ * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39.
+ *
+ * <p>
+ * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings,
+ * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers.
+ *
+ * <p>
+ * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks
+ * is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions
+ * with a {@link USpoofCheckResult} parameter:
+ *
+ * \code{.c}
+ * UErrorCode status = U_ZERO_ERROR;
+ * UChar* str = (UChar*) u"pаypаl"; // with Cyrillic 'а' characters
+ *
+ * // Get the default set of allowable characters:
+ * USet* allowed = uset_openEmpty();
+ * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
+ * uset_addAll(allowed, uspoof_getInclusionSet(&status));
+ *
+ * USpoofChecker* sc = uspoof_open(&status);
+ * uspoof_setAllowedChars(sc, allowed, &status);
+ * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
+ *
+ * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
+ * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status);
+ *
+ * int32_t failures1 = bitmask;
+ * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
+ * assert(failures1 == failures2);
+ * // checks that failed: 16 (success: 1)
+ * printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
+ *
+ * // Cleanup:
+ * uspoof_close(sc);
+ * uset_close(allowed);
+ * uspoof_closeCheckResult(checkResult);
+ * \endcode
+ *
+ * C++ users can take advantage of a few syntactical conveniences. The following snippet is functionally
+ * equivalent to the one above:
+ *
+ * \code{.cpp}
+ * UErrorCode status = U_ZERO_ERROR;
+ * UnicodeString str((UChar*) u"pаypаl"); // with Cyrillic 'а' characters
+ *
+ * // Get the default set of allowable characters:
+ * UnicodeSet allowed;
+ * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
+ * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
+ *
+ * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
+ * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
+ * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
+ *
+ * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
+ * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
+ *
+ * int32_t failures1 = bitmask;
+ * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
+ * assert(failures1 == failures2);
+ * // checks that failed: 16 (success: 1)
+ * printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
+ *
+ * // Explicit cleanup not necessary.
+ * \endcode
+ *
+ * <p>
+ * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
+ * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
+ *
+ * <ul>
+ * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
+ * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
+ * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
+ * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
+ * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
+ * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
+ * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li>
+ * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
+ * </ul>
+ *
+ * <p>
+ * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
+ * INVISIBLE and MIXED_NUMBERS conditions, you could do:
+ *
+ * \code{.c}
+ * UErrorCode status = U_ZERO_ERROR;
+ * UChar* str = (UChar*) u"৪8";
+ *
+ * USpoofChecker* sc = uspoof_open(&status);
+ * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
+ *
+ * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
+ * UBool result = bitmask != 0;
+ * printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status)); // fails checks: 1 (success: 1)
+ * uspoof_close(sc);
+ * \endcode
+ *
+ * <p>
+ * Here is an example in C++ showing how to compute the restriction level of a string:
+ *
+ * \code{.cpp}
+ * UErrorCode status = U_ZERO_ERROR;
+ * UnicodeString str((UChar*) u"pаypаl"); // with Cyrillic 'а' characters
+ *
+ * // Get the default set of allowable characters:
+ * UnicodeSet allowed;
+ * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
+ * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
+ *
+ * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
+ * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
+ * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
+ * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status);
+ *
+ * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
+ * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
+ *
+ * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
+ * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available via the bitmask:
+ * assert((restrictionLevel & bitmask) == restrictionLevel);
+ * // Restriction level: 1342177280 (success: 1)
+ * printf("Restriction level: %d (success: %d)\n", restrictionLevel, U_SUCCESS(status));
+ * \endcode
+ *
+ * <p>
+ * The code '1342177280' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
+ * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
+ *
+ * <p>
+ * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
+ * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
+ * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
+ * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
+ * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
+ * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
+ * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of
+ * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code
+ * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
+ * scripts.
+ *
+ * <h2>Additional Information</h2>
+ *
+ * <p>
+ * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
+ *
+ * <p>
+ * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
+ * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
+ * using the same USpoofChecker instance.
+ *
+ * <p>
+ * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
+ * thread safe. Those that take a non-const USpoofChecker are not thread safe.
+ *
+ * @stable ICU 4.6
*/
struct USpoofChecker;
typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */
+#ifndef U_HIDE_DRAFT_API
+/**
+ * @see uspoof_openCheckResult
+ */
+struct USpoofCheckResult;
+/**
+ * @see uspoof_openCheckResult
+ */
+typedef struct USpoofCheckResult USpoofCheckResult;
+#endif /* U_HIDE_DRAFT_API */
+
/**
* Enum for the kinds of checks that USpoofChecker can perform.
* These enum values are used both to select the set of checks that
@@ -162,45 +384,63 @@
* @stable ICU 4.2
*/
typedef enum USpoofChecks {
- /** Single script confusable test.
- * When testing whether two identifiers are confusable, report that they are if
- * both are from the same script and they are visually confusable.
- * Note: this test is not applicable to a check of a single identifier.
- */
+ /**
+ * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
+ * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
+ * 4.
+ *
+ * @see uspoof_areConfusable
+ * @stable ICU 4.2
+ */
USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1,
- /** Mixed script confusable test.
- * When checking a single identifier, report a problem if
- * the identifier contains multiple scripts, and
- * is confusable with some other identifier in a single script
- * When testing whether two identifiers are confusable, report that they are if
- * the two IDs are visually confusable,
- * and at least one contains characters from more than one script.
+ /**
+ * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
+ * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
+ * 39 section 4.
+ *
+ * @see uspoof_areConfusable
+ * @stable ICU 4.2
*/
USPOOF_MIXED_SCRIPT_CONFUSABLE = 2,
- /** Whole script confusable test.
- * When checking a single identifier, report a problem if
- * The identifier is of a single script, and
- * there exists a confusable identifier in another script.
- * When testing whether two identifiers are confusable, report that they are if
- * each is of a single script,
- * the scripts of the two identifiers are different, and
- * the identifiers are visually confusable.
+ /**
+ * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
+ * that the two strings are visually confusable and that they are not from the same script but both of them are
+ * single-script strings, according to UTS 39 section 4.
+ *
+ * @see uspoof_areConfusable
+ * @stable ICU 4.2
*/
USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4,
- /** Any Case Modifier for confusable identifier tests.
- If specified, consider all characters, of any case, when looking for confusables.
- If USPOOF_ANY_CASE is not specified, identifiers being checked are assumed to have been
- case folded. Upper case confusable characters will not be checked.
- Selects between Lower Case Confusable and
- Any Case Confusable. */
+ /**
+ * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set
+ * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
+ * make {@link uspoof_areConfusable} return only those types of confusables.
+ *
+ * <p>Note: if you wish to use {@link uspoof_getSkeleton}, it is required that you enable at least one of the
+ * CONFUSABLE flags.
+ *
+ * @see uspoof_areConfusable
+ * @see uspoof_getSkeleton
+ * @draft ICU 58
+ * @provisional This API might change or be removed in a future release.
+ */
+ USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
+
+#ifndef U_HIDE_DEPRECATED_API
+ /**
+ * This flag is deprecated and no longer affects the behavior of SpoofChecker.
+ *
+ * @deprecated ICU 58 This API was deprecated in UTS 39 Version 8 and is no longer used.
+ */
USPOOF_ANY_CASE = 8,
+#endif /* U_HIDE_DEPRECATED_API */
/**
* Check that an identifier is no looser than the specified RestrictionLevel.
- * The default if uspoof_setRestrctionLevel() is not called is HIGHLY_RESTRICTIVE.
+ * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE.
*
* If USPOOF_AUX_INFO is enabled the actual restriction level of the
* identifier being tested will also be returned by uspoof_check().
@@ -231,14 +471,15 @@
USPOOF_INVISIBLE = 32,
/** Check that an identifier contains only characters from a specified set
- * of acceptable characters. See uspoof_setAllowedChars() and
- * uspoof_setAllowedLocales().
+ * of acceptable characters. See {@link uspoof_setAllowedChars} and
+ * {@link uspoof_setAllowedLocales}. Note that a string that fails this check
+ * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
*/
USPOOF_CHAR_LIMIT = 64,
/**
- * Check that an identifier does not include decimal digits from
- * more than one numbering system.
+ * Check that an identifier does not mix numbers from different numbering systems.
+ * For more information, see UTS 39 section 5.3.
*
* @stable ICU 51
*/
@@ -255,11 +496,11 @@
* Enable the return of auxillary (non-error) information in the
* upper bits of the check results value.
*
- * If this "check" is not enabled, the results of uspoof_check() will be zero when an
- * identifier passes all of the enabled checks.
+ * If this "check" is not enabled, the results of {@link uspoof_check} will be
+ * zero when an identifier passes all of the enabled checks.
*
- * If this "check" is enabled, (uspoof_check() & USPOOF_ALL_CHECKS) will be zero
- * when an identifier passes all checks.
+ * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will
+ * be zero when an identifier passes all checks.
*
* @stable ICU 51
*/
@@ -269,39 +510,53 @@
/**
- * Constants from UAX #39 for use in setRestrictionLevel(), and
+ * Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and
* for returned identifier restriction levels in check results.
+ *
* @stable ICU 51
+ *
+ * @see uspoof_setRestrictionLevel
+ * @see uspoof_check
*/
typedef enum URestrictionLevel {
/**
- * Only ASCII characters: U+0000..U+007F
+ * All characters in the string are in the identifier profile and all characters in the string are in the
+ * ASCII range.
*
* @stable ICU 51
*/
USPOOF_ASCII = 0x10000000,
/**
- * All characters in each identifier must be from a single script.
- *
- * @stable ICU 53
- */
+ * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and
+ * the string is single-script, according to the definition in UTS 39 section 5.1.
+ *
+ * @stable ICU 53
+ */
USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000,
/**
- * All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
- * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
- * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
+ * The string classifies as Single Script, or all characters in the string are in the identifier profile and
+ * the string is covered by any of the following sets of scripts, according to the definition in UTS 39
+ * section 5.1:
+ * <ul>
+ * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
+ * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
+ * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
+ * </ul>
+ * This is the default restriction in ICU.
*
* @stable ICU 51
*/
USPOOF_HIGHLY_RESTRICTIVE = 0x30000000,
/**
- * Allow Latin with other scripts except Cyrillic, Greek, Cherokee Otherwise, the same as Highly Restrictive
+ * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
+ * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
+ * Greek, and Cherokee.
*
* @stable ICU 51
*/
USPOOF_MODERATELY_RESTRICTIVE = 0x40000000,
/**
- * Allow arbitrary mixtures of scripts. Otherwise, the same as Moderately Restrictive.
+ * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts.
*
* @stable ICU 51
*/
@@ -313,11 +568,16 @@
*/
USPOOF_UNRESTRICTIVE = 0x60000000,
/**
- * Mask for selecting the Restriction Level bits from the return value of uspoof_check().
- *
- * @stable ICU 53
- */
- USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000
+ * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}.
+ *
+ * @stable ICU 53
+ */
+ USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000,
+ /**
+ * An undefined restriction level.
+ * @internal
+ */
+ USPOOF_UNDEFINED_RESTRICTIVE = -1,
} URestrictionLevel;
/**
@@ -361,10 +621,10 @@
/**
* Open a Spoof Checker from the source form of the spoof data.
- * The two inputs correspond to the Unicode data files confusables.txt
- * and confusablesWholeScript.txt as described in Unicode UAX #39.
- * The syntax of the source data is as described in UAX #39 for
- * these files, and the content of these files is acceptable input.
+ * The input corresponds to the Unicode data file confusables.txt
+ * as described in Unicode UAX #39. The syntax of the source data
+ * is as described in UAX #39 for this file, and the content of
+ * this file is acceptable input.
*
* The character encoding of the (char *) input text is UTF-8.
*
@@ -373,10 +633,9 @@
* @param confusablesLen The length of the confusables text, or -1 if the
* input string is zero terminated.
* @param confusablesWholeScript
- * a pointer to the whole script confusables definitions,
- * as found in the file confusablesWholeScript.txt from unicode.org.
- * @param confusablesWholeScriptLen The length of the whole script confusables text, or
- * -1 if the input string is zero terminated.
+ * Deprecated in ICU 58. No longer used.
+ * @param confusablesWholeScriptLen
+ * Deprecated in ICU 58. No longer used.
* @param errType In the event of an error in the input, indicates
* which of the input files contains the error.
* The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or
@@ -437,8 +696,33 @@
/**
- * Specify the set of checks that will be performed by the check
- * functions of this Spoof Checker.
+ * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method
+ * overwrites any checks that may have already been enabled. By default, all checks are enabled.
+ *
+ * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. For
+ * example, to fail strings containing characters outside of the set specified by {@link uspoof_setAllowedChars} and
+ * also strings that contain digits from mixed numbering systems:
+ *
+ * <pre>
+ * {@code
+ * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status);
+ * }
+ * </pre>
+ *
+ * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from
+ * ALL_CHECKS. For example, if you are not planning to use the {@link uspoof_areConfusable} functionality,
+ * it is good practice to disable the CONFUSABLE check:
+ *
+ * <pre>
+ * {@code
+ * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status);
+ * }
+ * </pre>
+ *
+ * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and
+ * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
+ * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
+ * methods.
*
* @param sc The USpoofChecker
* @param checks The set of checks that this spoof checker will perform.
@@ -466,9 +750,10 @@
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
/**
- * Set the loosest restriction level allowed. The default if this function
- * is not called is HIGHLY_RESTRICTIVE.
- * Calling this function also enables the RESTRICTION_LEVEL check.
+ * Set the loosest restriction level allowed for strings. The default if this is not called is
+ * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
+ * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.2 and 5.3 of UTS 39. To customize which checks are
+ * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
* @param restrictionLevel The loosest restriction level allowed.
* @see URestrictionLevel
* @stable ICU 51
@@ -478,7 +763,7 @@
/**
- * Get the Restriction Level that will be tested if the checks include RESTRICTION_LEVEL.
+ * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}.
*
* @return The restriction level
* @see URestrictionLevel
@@ -501,7 +786,7 @@
* Supplying an empty string removes all restrictions;
* characters from any script will be allowed.
*
- * The USPOOF_CHAR_LIMIT test is automatically enabled for this
+ * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this
* USpoofChecker when calling this function with a non-empty list
* of locales.
*
@@ -513,7 +798,7 @@
* can be made to the result of uspoof_setAllowedLocales() by
* fetching the resulting set with uspoof_getAllowedChars(),
* manipulating it with the Unicode Set API, then resetting the
- * spoof detectors limits with uspoof_setAllowedChars()
+ * spoof detector's limits with uspoof_setAllowedChars().
*
* @param sc The USpoofChecker
* @param localesList A list list of locales, from which the language
@@ -656,16 +941,21 @@
* The text to be checked will typically be an identifier of some sort.
* The set of checks to be performed is specified with uspoof_setChecks().
*
+ * \note
+ * Consider using the newer API, {@link uspoof_check2}, instead.
+ * The newer API exposes additional information from the check procedure
+ * and is otherwise identical to this method.
+ *
* @param sc The USpoofChecker
* @param id The identifier to be checked for possible security issues,
* in UTF-16 format.
* @param length the length of the string to be checked, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* zero terminated.
- * @param position An out parameter.
- * Originally, the index of the first string position that failed a check.
- * Now, always returns zero.
- * This parameter may be null.
+ * @param position Deprecated in ICU 51. Always returns zero.
+ * Originally, an out parameter for the index of the first
+ * string position that failed a check.
+ * This parameter may be NULL.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
@@ -675,6 +965,7 @@
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
* will be zero if the input string passes all of the
* enabled checks.
+ * @see uspoof_check2
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
@@ -689,15 +980,19 @@
* The text to be checked will typically be an identifier of some sort.
* The set of checks to be performed is specified with uspoof_setChecks().
*
+ * \note
+ * Consider using the newer API, {@link uspoof_check2UTF8}, instead.
+ * The newer API exposes additional information from the check procedure
+ * and is otherwise identical to this method.
+ *
* @param sc The USpoofChecker
* @param id A identifier to be checked for possible security issues, in UTF8 format.
* @param length the length of the string to be checked, or -1 if the string is
* zero terminated.
- * @param position An out parameter.
- * Originally, the index of the first string position that failed a check.
- * Now, always returns zero.
- * This parameter may be null.
- * @deprecated ICU 51
+ * @param position Deprecated in ICU 51. Always returns zero.
+ * Originally, an out parameter for the index of the first
+ * string position that failed a check.
+ * This parameter may be NULL.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
@@ -709,6 +1004,7 @@
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
* will be zero if the input string passes all of the
* enabled checks.
+ * @see uspoof_check2UTF8
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
@@ -724,13 +1020,17 @@
* The text to be checked will typically be an identifier of some sort.
* The set of checks to be performed is specified with uspoof_setChecks().
*
+ * \note
+ * Consider using the newer API, {@link uspoof_check2UnicodeString}, instead.
+ * The newer API exposes additional information from the check procedure
+ * and is otherwise identical to this method.
+ *
* @param sc The USpoofChecker
* @param id A identifier to be checked for possible security issues.
- * @param position An out parameter.
- * Originally, the index of the first string position that failed a check.
- * Now, always returns zero.
- * This parameter may be null.
- * @deprecated ICU 51
+ * @param position Deprecated in ICU 51. Always returns zero.
+ * Originally, an out parameter for the index of the first
+ * string position that failed a check.
+ * This parameter may be NULL.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
@@ -740,6 +1040,7 @@
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
* will be zero if the input string passes all of the
* enabled checks.
+ * @see uspoof_check2UnicodeString
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
@@ -747,26 +1048,224 @@
const icu::UnicodeString &id,
int32_t *position,
UErrorCode *status);
-
#endif
/**
+ * Check the specified string for possible security issues.
+ * The text to be checked will typically be an identifier of some sort.
+ * The set of checks to be performed is specified with uspoof_setChecks().
+ *
+ * @param sc The USpoofChecker
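+ * @param id The identifier to be checked for possible security issues,
+ * in UTF-16 format.
+ * @param length the length of the string to be checked, expressed in
+ * 16 bit UTF-16 code units, or -1 if the string is
+ * zero terminated.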
+ * @param checkResult An instance of USpoofCheckResult to be filled with
+ * details about the identifier. Can be NULL.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * Spoofing or security issues detected with the input string are
+ * not reported here, but through the function's return value.
+ * @return An integer value with bits set for any potential security
+ * or spoofing issues detected. The bits are defined by
+ * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
+ * will be zero if the input string passes all of the
+ * enabled checks. Any information in this bitmask will be
+ * consistent with the information saved in the optional
+ * checkResult parameter.
+ * @see uspoof_openCheckResult
+ * @see uspoof_check2UTF8
+ * @see uspoof_check2UnicodeString
+ * @draft ICU 58
+ */
+U_DRAFT int32_t U_EXPORT2
+uspoof_check2(const USpoofChecker *sc,
+ const UChar* id, int32_t length,
+ USpoofCheckResult* checkResult,
+ UErrorCode *status);
+
+/**
+ * Check the specified string for possible security issues.
+ * The text to be checked will typically be an identifier of some sort.
+ * The set of checks to be performed is specified with uspoof_setChecks().
+ *
+ * This version of {@link uspoof_check} accepts a USpoofCheckResult, which
+ * returns additional information about the identifier. For more
+ * information, see {@link uspoof_openCheckResult}.
+ *
+ * @param sc The USpoofChecker
+ * @param id An identifier to be checked for possible security issues, in UTF8 format.
+ * @param length the length of the string to be checked, or -1 if the string is
+ * zero terminated.
+ * @param checkResult An instance of USpoofCheckResult to be filled with
+ * details about the identifier. Can be NULL.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * Spoofing or security issues detected with the input string are
+ * not reported here, but through the function's return value.
+ * @return An integer value with bits set for any potential security
+ * or spoofing issues detected. The bits are defined by
+ * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
+ * will be zero if the input string passes all of the
+ * enabled checks. Any information in this bitmask will be
+ * consistent with the information saved in the optional
+ * checkResult parameter.
+ * @see uspoof_openCheckResult
+ * @see uspoof_check2
+ * @see uspoof_check2UnicodeString
+ * @draft ICU 58
+ */
+U_DRAFT int32_t U_EXPORT2
+uspoof_check2UTF8(const USpoofChecker *sc,
+ const char *id, int32_t length,
+ USpoofCheckResult* checkResult,
+ UErrorCode *status);
+
+#if U_SHOW_CPLUSPLUS_API
+/**
+ * Check the specified string for possible security issues.
+ * The text to be checked will typically be an identifier of some sort.
+ * The set of checks to be performed is specified with uspoof_setChecks().
+ *
+ * @param sc The USpoofChecker
+ * @param id An identifier to be checked for possible security issues.
+ * @param checkResult An instance of USpoofCheckResult to be filled with
+ * details about the identifier. Can be NULL.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * Spoofing or security issues detected with the input string are
+ * not reported here, but through the function's return value.
+ * @return An integer value with bits set for any potential security
+ * or spoofing issues detected. The bits are defined by
+ * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
+ * will be zero if the input string passes all of the
+ * enabled checks. Any information in this bitmask will be
+ * consistent with the information saved in the optional
+ * checkResult parameter.
+ * @see uspoof_openCheckResult
+ * @see uspoof_check2
+ * @see uspoof_check2UTF8
+ * @draft ICU 58
+ */
+U_DRAFT int32_t U_EXPORT2
+uspoof_check2UnicodeString(const USpoofChecker *sc,
+ const icu::UnicodeString &id,
+ USpoofCheckResult* checkResult,
+ UErrorCode *status);
+#endif
+
+/**
+ * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return
+ * information about the identifier. Information includes:
+ * <ul>
+ * <li>A bitmask of the checks that failed</li>
+ * <li>The identifier's restriction level (UTS 39 section 5.2)</li>
+ * <li>The set of numerics in the string (UTS 39 section 5.3)</li>
+ * </ul>
+ * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call
+ * of {@link uspoof_check2}.
+ *
+ * @param status The error code, set if this function encounters a problem.
+ * @return the newly created USpoofCheckResult
+ * @see uspoof_check2
+ * @see uspoof_check2UTF8
+ * @see uspoof_check2UnicodeString
+ * @draft ICU 58
+ */
+U_DRAFT USpoofCheckResult* U_EXPORT2
+uspoof_openCheckResult(UErrorCode *status);
+
+/**
+ * Close a USpoofCheckResult, freeing any memory that was being held by
+ * its implementation.
+ *
+ * @param checkResult The instance of USpoofCheckResult to close
+ * @draft ICU 58
+ */
+U_DRAFT void U_EXPORT2
+uspoof_closeCheckResult(USpoofCheckResult *checkResult);
+
+#if U_SHOW_CPLUSPLUS_API
+
+U_NAMESPACE_BEGIN
+
+/**
+ * \class LocalUSpoofCheckResultPointer
+ * "Smart pointer" class, closes a USpoofCheckResult via {@link uspoof_closeCheckResult}.
+ * For most methods see the LocalPointerBase base class.
+ *
+ * @see LocalPointerBase
+ * @see LocalPointer
+ * @draft ICU 58
+ */
+U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult);
+
+U_NAMESPACE_END
+
+#endif
+
+/**
+ * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
+ * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on.
+ *
+ * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
+ * @param status The error code, set if an error occurred.
+ * @return An integer value with bits set for any potential security
+ * or spoofing issues detected. The bits are defined by
+ * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
+ * will be zero if the input string passes all of the
+ * enabled checks.
+ * @see uspoof_setChecks
+ * @draft ICU 58
+ */
+U_DRAFT int32_t U_EXPORT2
+uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status);
+
+/**
+ * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check
+ * was enabled; otherwise, undefined.
+ *
+ * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
+ * @param status The error code, set if an error occurred.
+ * @return The restriction level contained in the USpoofCheckResult
+ * @see uspoof_setRestrictionLevel
+ * @draft ICU 58
+ */
+U_DRAFT URestrictionLevel U_EXPORT2
+uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status);
+
+/**
+ * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled;
+ * otherwise, undefined. Ownership of the returned USet remains with the USpoofCheckResult.
+ * The USet will be freed when {@link uspoof_closeCheckResult} is called.
+ *
+ * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
+ * @return The set of numerics contained in the USpoofCheckResult
+ * @param status The error code, set if an error occurred.
+ * @draft ICU 58
+ */
+U_DRAFT const USet* U_EXPORT2
+uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status);
+
+
+/**
* Check the whether two specified strings are visually confusable.
- * The types of confusability to be tested - single script, mixed script,
- * or whole script - are determined by the check options set for the
- * USpoofChecker.
*
- * The tests to be performed are controlled by the flags
- * USPOOF_SINGLE_SCRIPT_CONFUSABLE
- * USPOOF_MIXED_SCRIPT_CONFUSABLE
- * USPOOF_WHOLE_SCRIPT_CONFUSABLE
- * At least one of these tests must be selected.
+ * If the strings are confusable, the return value will be nonzero, as long as
+ * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
*
- * USPOOF_ANY_CASE is a modifier for the tests. Select it if the identifiers
- * may be of mixed case.
- * If identifiers are case folded for comparison and
- * display to the user, do not select the USPOOF_ANY_CASE option.
+ * The bits in the return value correspond to flags for each of the classes of
+ * confusables applicable to the two input strings. According to UTS 39
+ * section 4, the possible flags are:
+ *
+ * <ul>
+ * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
+ * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
+ * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
+ * </ul>
+ *
+ * If one or more of the above flags were not listed in uspoof_setChecks(), this
+ * function will never report that class of confusable. The check
+ * {@link USPOOF_CONFUSABLE} enables all three flags.
*
*
* @param sc The USpoofChecker
@@ -788,6 +1287,7 @@
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
+ *
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
@@ -799,10 +1299,7 @@
/**
- * Check the whether two specified strings are visually confusable.
- * The types of confusability to be tested - single script, mixed script,
- * or whole script - are determined by the check options set for the
- * USpoofChecker.
+ * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
*
* @param sc The USpoofChecker
* @param id1 The first of the two identifiers to be compared for
@@ -821,7 +1318,10 @@
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the strings
* are not confusable.
+ *
* @stable ICU 4.2
+ *
+ * @see uspoof_areConfusable
*/
U_STABLE int32_t U_EXPORT2
uspoof_areConfusableUTF8(const USpoofChecker *sc,
@@ -834,10 +1334,7 @@
#if U_SHOW_CPLUSPLUS_API
/**
- * Check the whether two specified strings are visually confusable.
- * The types of confusability to be tested - single script, mixed script,
- * or whole script - are determined by the check options set for the
- * USpoofChecker.
+ * A version of {@link uspoof_areConfusable} accepting UnicodeStrings.
*
* @param sc The USpoofChecker
* @param s1 The first of the two identifiers to be compared for
@@ -852,7 +1349,10 @@
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
+ *
* @stable ICU 4.2
+ *
+ * @see uspoof_areConfusable
*/
U_STABLE int32_t U_EXPORT2
uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
@@ -863,37 +1363,36 @@
/**
- * Get the "skeleton" for an identifier.
- * Skeletons are a transformation of the input identifier;
- * Two identifiers are confusable if their skeletons are identical.
- * See Unicode UAX #39 for additional information.
- *
- * Using skeletons directly makes it possible to quickly check
- * whether an identifier is confusable with any of some large
- * set of existing identifiers, by creating an efficiently
- * searchable collection of the skeletons.
- *
- * @param sc The USpoofChecker
- * @param type The type of skeleton, corresponding to which
- * of the Unicode confusable data tables to use.
- * The default is Mixed-Script, Lowercase.
- * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
- * USPOOF_ANY_CASE. The two flags may be ORed.
- * @param id The input identifier whose skeleton will be computed.
- * @param length The length of the input identifier, expressed in 16 bit
- * UTF-16 code units, or -1 if the string is zero terminated.
- * @param dest The output buffer, to receive the skeleton string.
- * @param destCapacity The length of the output buffer, in 16 bit units.
- * The destCapacity may be zero, in which case the function will
- * return the actual length of the skeleton.
- * @param status The error code, set if an error occurred while attempting to
- * perform the check.
- * @return The length of the skeleton string. The returned length
- * is always that of the complete skeleton, even when the
- * supplied buffer is too small (or of zero length)
- *
- * @stable ICU 4.2
- */
+ * Get the "skeleton" for an identifier.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are confusable if their skeletons are identical.
+ * See Unicode UAX #39 for additional information.
+ *
+ * Using skeletons directly makes it possible to quickly check
+ * whether an identifier is confusable with any of some large
+ * set of existing identifiers, by creating an efficiently
+ * searchable collection of the skeletons.
+ *
+ * @param sc The USpoofChecker
+ * @param type Deprecated in ICU 58. You may pass any number.
+ * Originally, controlled which of the Unicode confusable data
+ * tables to use.
+ * @param id The input identifier whose skeleton will be computed.
+ * @param length The length of the input identifier, expressed in 16 bit
+ * UTF-16 code units, or -1 if the string is zero terminated.
+ * @param dest The output buffer, to receive the skeleton string.
+ * @param destCapacity The length of the output buffer, in 16 bit units.
+ * The destCapacity may be zero, in which case the function will
+ * return the actual length of the skeleton.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * @return The length of the skeleton string. The returned length
+ * is always that of the complete skeleton, even when the
+ * supplied buffer is too small (or of zero length)
+ *
+ * @stable ICU 4.2
+ * @see uspoof_areConfusable
+ */
U_STABLE int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
uint32_t type,
@@ -902,40 +1401,38 @@
UErrorCode *status);
/**
- * Get the "skeleton" for an identifier.
- * Skeletons are a transformation of the input identifier;
- * Two identifiers are confusable if their skeletons are identical.
- * See Unicode UAX #39 for additional information.
- *
- * Using skeletons directly makes it possible to quickly check
- * whether an identifier is confusable with any of some large
- * set of existing identifiers, by creating an efficiently
- * searchable collection of the skeletons.
- *
- * @param sc The USpoofChecker
- * @param type The type of skeleton, corresponding to which
- * of the Unicode confusable data tables to use.
- * The default is Mixed-Script, Lowercase.
- * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
- * USPOOF_ANY_CASE. The two flags may be ORed.
- * @param id The UTF-8 format identifier whose skeleton will be computed.
- * @param length The length of the input string, in bytes,
- * or -1 if the string is zero terminated.
- * @param dest The output buffer, to receive the skeleton string.
- * @param destCapacity The length of the output buffer, in bytes.
- * The destCapacity may be zero, in which case the function will
- * return the actual length of the skeleton.
- * @param status The error code, set if an error occurred while attempting to
- * perform the check. Possible Errors include U_INVALID_CHAR_FOUND
- * for invalid UTF-8 sequences, and
- * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
- * to hold the complete skeleton.
- * @return The length of the skeleton string, in bytes. The returned length
- * is always that of the complete skeleton, even when the
- * supplied buffer is too small (or of zero length)
- *
- * @stable ICU 4.2
- */
+ * Get the "skeleton" for an identifier.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are confusable if their skeletons are identical.
+ * See Unicode UAX #39 for additional information.
+ *
+ * Using skeletons directly makes it possible to quickly check
+ * whether an identifier is confusable with any of some large
+ * set of existing identifiers, by creating an efficiently
+ * searchable collection of the skeletons.
+ *
+ * @param sc The USpoofChecker
+ * @param type Deprecated in ICU 58. You may pass any number.
+ * Originally, controlled which of the Unicode confusable data
+ * tables to use.
+ * @param id The UTF-8 format identifier whose skeleton will be computed.
+ * @param length The length of the input string, in bytes,
+ * or -1 if the string is zero terminated.
+ * @param dest The output buffer, to receive the skeleton string.
+ * @param destCapacity The length of the output buffer, in bytes.
+ * The destCapacity may be zero, in which case the function will
+ * return the actual length of the skeleton.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check. Possible Errors include U_INVALID_CHAR_FOUND
+ * for invalid UTF-8 sequences, and
+ * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
+ * to hold the complete skeleton.
+ * @return The length of the skeleton string, in bytes. The returned length
+ * is always that of the complete skeleton, even when the
+ * supplied buffer is too small (or of zero length)
+ *
+ * @stable ICU 4.2
+ */
U_STABLE int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
uint32_t type,
@@ -945,30 +1442,28 @@
#if U_SHOW_CPLUSPLUS_API
/**
- * Get the "skeleton" for an identifier.
- * Skeletons are a transformation of the input identifier;
- * Two identifiers are confusable if their skeletons are identical.
- * See Unicode UAX #39 for additional information.
- *
- * Using skeletons directly makes it possible to quickly check
- * whether an identifier is confusable with any of some large
- * set of existing identifiers, by creating an efficiently
- * searchable collection of the skeletons.
- *
- * @param sc The USpoofChecker.
- * @param type The type of skeleton, corresponding to which
- * of the Unicode confusable data tables to use.
- * The default is Mixed-Script, Lowercase.
- * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
- * USPOOF_ANY_CASE. The two flags may be ORed.
- * @param id The input identifier whose skeleton will be computed.
- * @param dest The output identifier, to receive the skeleton string.
- * @param status The error code, set if an error occurred while attempting to
- * perform the check.
- * @return A reference to the destination (skeleton) string.
- *
- * @stable ICU 4.2
- */
+ * Get the "skeleton" for an identifier.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are confusable if their skeletons are identical.
+ * See Unicode UAX #39 for additional information.
+ *
+ * Using skeletons directly makes it possible to quickly check
+ * whether an identifier is confusable with any of some large
+ * set of existing identifiers, by creating an efficiently
+ * searchable collection of the skeletons.
+ *
+ * @param sc The USpoofChecker.
+ * @param type Deprecated in ICU 58. You may pass any number.
+ * Originally, controlled which of the Unicode confusable data
+ * tables to use.
+ * @param id The input identifier whose skeleton will be computed.
+ * @param dest The output identifier, to receive the skeleton string.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * @return A reference to the destination (skeleton) string.
+ *
+ * @stable ICU 4.2
+ */
U_I18N_API icu::UnicodeString & U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
uint32_t type,
diff --git a/icu4c/source/i18n/uspoof.cpp b/icu4c/source/i18n/uspoof.cpp
index 243677b..9cb41e2 100644
--- a/icu4c/source/i18n/uspoof.cpp
+++ b/icu4c/source/i18n/uspoof.cpp
@@ -22,7 +22,6 @@
#include "unicode/utf16.h"
#include "cmemory.h"
#include "cstring.h"
-#include "identifier_info.h"
#include "mutex.h"
#include "scriptset.h"
#include "uassert.h"
@@ -42,9 +41,7 @@
static UnicodeSet *gInclusionSet = NULL;
static UnicodeSet *gRecommendedSet = NULL;
static const Normalizer2 *gNfdNormalizer = NULL;
-static SpoofData *gDefaultSpoofData = NULL;
static UInitOnce gSpoofInitStaticsOnce = U_INITONCE_INITIALIZER;
-static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
static UBool U_CALLCONV
uspoof_cleanup(void) {
@@ -53,83 +50,78 @@
delete gRecommendedSet;
gRecommendedSet = NULL;
gNfdNormalizer = NULL;
- if (gDefaultSpoofData) {
- gDefaultSpoofData->removeReference(); // Will delete, assuming all user-level spoof checkers were closed.
- }
- gDefaultSpoofData = NULL;
gSpoofInitStaticsOnce.reset();
- gSpoofInitDefaultOnce.reset();
return TRUE;
}
static void U_CALLCONV initializeStatics(UErrorCode &status) {
static const char *inclusionPat =
- "[\\u0027\\u002D-\\u002E\\u003A\\u00B7\\u0375\\u058A\\u05F3-\\u05F4"
- "\\u06FD-\\u06FE\\u0F0B\\u200C-\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]";
+ "['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C\\u200D\\u2010\\u"
+ "2019\\u2027\\u30A0\\u30FB]";
gInclusionSet = new UnicodeSet(UnicodeString(inclusionPat, -1, US_INV), status);
gInclusionSet->freeze();
-
- // Note: data from http://unicode.org/Public/security/latest/xidmodifications.txt version 8.0.0
- // There is no tooling to generate this from the .txt file, hand extracted with editor macros.
- // Ultimately, data will be available as character properties, eliminating this.
+
+ // Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt
+ // There is tooling to generate this constant in the unicodetools project:
+ // org.unicode.text.tools.RecommendedSetGenerator
+ // It will print the Java and C++ code to the console for easy copy-paste into this file.
// Note: concatenated string constants do not work with UNICODE_STRING_SIMPLE on all platforms.
static const char *recommendedPat =
- "[\\u0030-\\u0039\\u0041-\\u005A\\u005F\\u0061-\\u007A\\u00C0-\\u00D6\\u00D8-\\u00F6"
- "\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0-\\u01A1"
- "\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B"
- "\\u021E-\\u021F\\u0226-\\u0233\\u0259\\u02BB-\\u02BC\\u02EC\\u0300-\\u0304\\u0306-\\u030C"
- "\\u030F-\\u0311\\u0313-\\u0314\\u031B\\u0323-\\u0328\\u032D-\\u032E\\u0330-\\u0331"
- "\\u0335\\u0338-\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386\\u0388-\\u038A\\u038C"
- "\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u052E-\\u052F\\u0531-\\u0556"
- "\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0620-\\u063F\\u0641-\\u0655"
- "\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06D3\\u06D5\\u06E5-\\u06E6"
- "\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2\\u0901-\\u094D\\u094F-\\u0950"
- "\\u0956-\\u0957\\u0960-\\u0963\\u0966-\\u096F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983"
- "\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9"
- "\\u09BC-\\u09C4\\u09C7-\\u09C8\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1"
- "\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32"
- "\\u0A35\\u0A38-\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47-\\u0A48\\u0A4B-\\u0A4D\\u0A5C"
- "\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0"
- "\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0"
- "\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28"
- "\\u0B2A-\\u0B30\\u0B32-\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47-\\u0B48\\u0B4B-\\u0B4D"
- "\\u0B56-\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82-\\u0B83\\u0B85-\\u0B8A"
- "\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99-\\u0B9A\\u0B9C\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4"
- "\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0"
- "\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28"
- "\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55-\\u0C56"
- "\\u0C60-\\u0C61\\u0C66-\\u0C6F\\u0C82-\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8"
- "\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5-\\u0CD6"
- "\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1-\\u0CF2\\u0D02-\\u0D03\\u0D05-\\u0D0C\\u0D0E-\\u0D10"
- "\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D57\\u0D60-\\u0D61"
- "\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82-\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5"
- "\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6"
- "\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59"
- "\\u0E81-\\u0E82\\u0E84\\u0E87-\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F"
- "\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA-\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD"
- "\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE-\\u0EDF\\u0F00\\u0F20-\\u0F29"
- "\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F56"
- "\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71-\\u0F72\\u0F74\\u0F7A-\\u0F80"
- "\\u0F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
- "\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D"
- "\\u10C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D"
- "\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0"
- "\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310"
- "\\u1312-\\u1315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7"
- "\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1E00-\\u1E99"
- "\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
- "\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78"
- "\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8"
- "\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC"
- "\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6"
- "\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6"
- "\\u2DD8-\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099-\\u309A\\u309D-\\u309E\\u30A1-\\u30FA"
- "\\u30FC-\\u30FE\\u3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660-\\uA661"
- "\\uA674-\\uA67B\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D-\\uA78E\\uA790-\\uA793"
- "\\uA7A0-\\uA7AA\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06"
- "\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E-\\uFA0F"
- "\\uFA11\\uFA13-\\uFA14\\uFA1F\\uFA21\\uFA23-\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6"
- "\\U0002A700-\\U0002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]";
+ "[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u014"
+ "8\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E"
+ "6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02B"
+ "C\\u02EC\\u0300-\\u0304\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u03"
+ "28\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386"
+ "\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u05"
+ "2E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0"
+ "620-\\u063F\\u0641-\\u0655\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-"
+ "\\u06D3\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2"
+ "\\u08B6-\\u08BD\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096"
+ "F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u0"
+ "9A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE\\u"
+ "09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-"
+ "\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\"
+ "u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A9"
+ "3-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0"
+ "ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\"
+ "u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47"
+ "\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83"
+ "\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3"
+ "\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0B"
+ "D0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u"
+ "0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56"
+ "\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92"
+ "-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0"
+ "CD5\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D02\\u0D03\\u0D05-\\u0D0C\\u0"
+ "D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-\\u0D57"
+ "\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D9"
+ "6\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0"
+ "DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\"
+ "u0E59\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u"
+ "0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD\\"
+ "u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00\\u0F20-\\u0F29"
+ "\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F"
+ "56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0"
+ "F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
+ "\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10"
+ "C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u"
+ "1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2"
+ "-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1"
+ "315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-"
+ "\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1C80-\\u1C88\\u1E00-\\u1E9"
+ "9\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1"
+ "F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F"
+ "7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1"
+ "FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-"
+ "\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0"
+ "-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u3"
+ "005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-\\u30FA\\u30FC-\\u30FE\\u"
+ "3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660\\uA661\\uA674-\\uA67B"
+ "\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D\\uA78E\\uA790-\\uA793\\uA7A0-\\uA7AA\\uA7AE"
+ "\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB"
+ "11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13\\uF"
+ "A14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6\\U0002A700-\\U0"
+ "002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]";
gRecommendedSet = new UnicodeSet(UnicodeString(recommendedPat, -1, US_INV), status);
gRecommendedSet->freeze();
@@ -137,11 +129,6 @@
ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
}
-static void U_CALLCONV initializeDefaultData(UErrorCode &status) {
- gDefaultSpoofData = SpoofData::getDefault(status);
- ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
-}
-
U_CFUNC void uspoof_internalInitStatics(UErrorCode *status) {
umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status);
}
@@ -149,14 +136,10 @@
+// Runs the one-time static initialization, constructs a SpoofImpl and returns
+// it as an opaque USpoofChecker handle. Returns NULL when *status indicates
+// failure, including U_MEMORY_ALLOCATION_ERROR if the allocation fails.
U_CAPI USpoofChecker * U_EXPORT2
uspoof_open(UErrorCode *status) {
umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status);
- umtx_initOnce(gSpoofInitDefaultOnce, &initializeDefaultData, *status);
if (U_FAILURE(*status)) {
return NULL;
}
- SpoofImpl *si = new SpoofImpl(gDefaultSpoofData, *status);
- if (si) {
- gDefaultSpoofData->addReference();
- }
+ SpoofImpl *si = new SpoofImpl(*status);
if (U_SUCCESS(*status) && si == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
}
@@ -164,7 +147,7 @@
delete si;
si = NULL;
}
- return reinterpret_cast<USpoofChecker *>(si);
+ // si is NULL on the failure path above. The removed reinterpret_cast passed
+ // NULL through unchanged, but invoking a member function through a null
+ // pointer is undefined behavior, so return NULL explicitly in that case.
+ return (si == NULL) ? NULL : si->asUSpoofChecker();
}
@@ -190,9 +173,9 @@
}
if (pActualLength != NULL) {
- *pActualLength = sd->fRawData->fLength;
+ *pActualLength = sd->size();
}
- return reinterpret_cast<USpoofChecker *>(si);
+ return si->asUSpoofChecker();
}
@@ -207,7 +190,7 @@
delete result;
result = NULL;
}
- return reinterpret_cast<USpoofChecker *>(result);
+ return result->asUSpoofChecker();
}
@@ -335,7 +318,23 @@
const UChar *id, int32_t length,
int32_t *position,
UErrorCode *status) {
-
+
+ // Backwards compatibility:
+ if (position != NULL) {
+ *position = 0;
+ }
+
+ // Delegate to uspoof_check2
+ return uspoof_check2(sc, id, length, NULL, status);
+}
+
+
+U_CAPI int32_t U_EXPORT2
+uspoof_check2(const USpoofChecker *sc,
+ const UChar* id, int32_t length,
+ USpoofCheckResult* checkResult,
+ UErrorCode *status) {
+
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
if (This == NULL) {
return 0;
@@ -345,7 +344,7 @@
return 0;
}
UnicodeString idStr((length == -1), id, length); // Aliasing constructor.
- int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
+ int32_t result = uspoof_check2UnicodeString(sc, idStr, checkResult, status);
return result;
}
@@ -356,11 +355,27 @@
int32_t *position,
UErrorCode *status) {
+ // Backwards compatibility:
+ if (position != NULL) {
+ *position = 0;
+ }
+
+ // Delegate to uspoof_check2UTF8
+ return uspoof_check2UTF8(sc, id, length, NULL, status);
+}
+
+
+U_CAPI int32_t U_EXPORT2
+uspoof_check2UTF8(const USpoofChecker *sc,
+ const char *id, int32_t length,
+ USpoofCheckResult* checkResult,
+ UErrorCode *status) {
+ // UTF-8 entry point: returns 0 at once if *status already failed; otherwise converts id (measured with uprv_strlen when length is negative) to a UnicodeString and forwards to uspoof_check2UnicodeString.
if (U_FAILURE(*status)) {
return 0;
}
UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
- int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
+ int32_t result = uspoof_check2UnicodeString(sc, idStr, checkResult, status);
return result;
}
@@ -414,7 +429,7 @@
if (U_FAILURE(*status)) {
return 0;
}
- //
+ //
// See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
// and for definitions of the types (single, whole, mixed-script) of confusables.
@@ -422,125 +437,95 @@
// If no tests relavant to this function have been specified, return an error.
// TODO: is this really the right thing to do? It's probably an error on the caller's part,
// but logically we would just return 0 (no error).
- if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
- USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
+ if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
*status = U_INVALID_STATE_ERROR;
return 0;
}
- int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
- int32_t result = 0;
- IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status);
- if (U_FAILURE(*status)) {
+ // Compute the skeletons and check for confusability.
+ UnicodeString id1Skeleton;
+ uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id1, id1Skeleton, status);
+ UnicodeString id2Skeleton;
+ uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id2, id2Skeleton, status);
+ if (U_FAILURE(*status)) { return 0; }
+ if (id1Skeleton != id2Skeleton) {
return 0;
}
- identifierInfo->setIdentifier(id1, *status);
- int32_t id1ScriptCount = identifierInfo->getScriptCount();
- int32_t id1FirstScript = identifierInfo->getScripts()->nextSetBit(0);
- identifierInfo->setIdentifier(id2, *status);
- int32_t id2ScriptCount = identifierInfo->getScriptCount();
- int32_t id2FirstScript = identifierInfo->getScripts()->nextSetBit(0);
- This->releaseIdentifierInfo(identifierInfo);
- identifierInfo = NULL;
- if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
- UnicodeString id1Skeleton;
- UnicodeString id2Skeleton;
- if (id1ScriptCount <= 1 && id2ScriptCount <= 1 && id1FirstScript == id2FirstScript) {
- flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
- uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
- uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
- if (id1Skeleton == id2Skeleton) {
- result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
- }
+ // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
+ // of confusables according to UTS 39 section 4.
+ // Start by computing the resolved script sets of id1 and id2.
+ ScriptSet id1RSS;
+ This->getResolvedScriptSet(id1, id1RSS, *status);
+ ScriptSet id2RSS;
+ This->getResolvedScriptSet(id2, id2RSS, *status);
+
+ // Turn on all applicable flags
+ int32_t result = 0;
+ if (id1RSS.intersects(id2RSS)) {
+ result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
+ } else {
+ result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
+ if (!id1RSS.isEmpty() && !id2RSS.isEmpty()) {
+ result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
}
}
- if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
- // If the two inputs are single script confusable they cannot also be
- // mixed or whole script confusable, according to the UAX39 definitions.
- // So we can skip those tests.
- return result;
+ // Turn off flags that the user doesn't want
+ if ((This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) == 0) {
+ result &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
}
-
- // Two identifiers are whole script confusable if each is of a single script
- // and they are mixed script confusable.
- UBool possiblyWholeScriptConfusables =
- id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
-
- //
- // Mixed Script Check
- //
- if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
- // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
- // the mixed script table skeleton, which is what we want.
- // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
- UnicodeString id1Skeleton;
- UnicodeString id2Skeleton;
- flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
- uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
- uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
- if (id1Skeleton == id2Skeleton) {
- result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
- if (possiblyWholeScriptConfusables) {
- result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
- }
- }
+ if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) == 0) {
+ result &= ~USPOOF_MIXED_SCRIPT_CONFUSABLE;
+ }
+ if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) == 0) {
+ result &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;
}
return result;
}
-
-
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
- const icu::UnicodeString &id,
+ const icu::UnicodeString &id,
int32_t *position,
UErrorCode *status) {
- const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
- if (This == NULL) {
- return 0;
+
+ // Backwards compatibility:
+ if (position != NULL) {
+ *position = 0;
}
+
+ // Delegate to uspoof_check2UnicodeString
+ return uspoof_check2UnicodeString(sc, id, NULL, status);
+}
+
+int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* checkResult, UErrorCode* status) {
+ U_ASSERT(This != NULL);
+ U_ASSERT(checkResult != NULL);
+ checkResult->clear();
int32_t result = 0;
- IdentifierInfo *identifierInfo = NULL;
- if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
- identifierInfo = This->getIdentifierInfo(*status);
- if (U_FAILURE(*status)) {
- goto cleanupAndReturn;
- }
- identifierInfo->setIdentifier(id, *status);
- identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
- }
-
-
- if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
- URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
+ if (0 != (This->fChecks & USPOOF_RESTRICTION_LEVEL)) {
+ URestrictionLevel idRestrictionLevel = This->getRestrictionLevel(id, *status);
if (idRestrictionLevel > This->fRestrictionLevel) {
result |= USPOOF_RESTRICTION_LEVEL;
}
- if (This->fChecks & USPOOF_AUX_INFO) {
- result |= idRestrictionLevel;
- }
+ checkResult->fRestrictionLevel = idRestrictionLevel;
}
- if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
- const UnicodeSet *numerics = identifierInfo->getNumerics();
- if (numerics->size() > 1) {
+ if (0 != (This->fChecks & USPOOF_MIXED_NUMBERS)) {
+ UnicodeSet numerics;
+ This->getNumerics(id, numerics, *status);
+ if (numerics.size() > 1) {
result |= USPOOF_MIXED_NUMBERS;
}
-
- // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
- // We have no easy way to do the same in C.
- // if (checkResult != null) {
- // checkResult.numerics = numerics;
- // }
+ checkResult->fNumerics = numerics; // UnicodeSet::operator=
}
- if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
+ if (0 != (This->fChecks & USPOOF_CHAR_LIMIT)) {
int32_t i;
UChar32 c;
int32_t length = id.length();
@@ -554,103 +539,74 @@
}
}
- if (This->fChecks &
- (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
- // These are the checks that need to be done on NFD input
+ if (0 != (This->fChecks & USPOOF_INVISIBLE)) {
+ // This check needs to be done on NFD input
UnicodeString nfdText;
gNfdNormalizer->normalize(id, nfdText, *status);
int32_t nfdLength = nfdText.length();
- if (This->fChecks & USPOOF_INVISIBLE) {
-
- // scan for more than one occurence of the same non-spacing mark
- // in a sequence of non-spacing marks.
- int32_t i;
- UChar32 c;
- UChar32 firstNonspacingMark = 0;
- UBool haveMultipleMarks = FALSE;
- UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
-
- for (i=0; i<nfdLength ;) {
- c = nfdText.char32At(i);
- i += U16_LENGTH(c);
- if (u_charType(c) != U_NON_SPACING_MARK) {
- firstNonspacingMark = 0;
- if (haveMultipleMarks) {
- marksSeenSoFar.clear();
- haveMultipleMarks = FALSE;
- }
- continue;
- }
- if (firstNonspacingMark == 0) {
- firstNonspacingMark = c;
- continue;
- }
- if (!haveMultipleMarks) {
- marksSeenSoFar.add(firstNonspacingMark);
- haveMultipleMarks = TRUE;
- }
- if (marksSeenSoFar.contains(c)) {
- // report the error, and stop scanning.
- // No need to find more than the first failure.
- result |= USPOOF_INVISIBLE;
- break;
- }
- marksSeenSoFar.add(c);
- }
- }
-
+ // scan for more than one occurrence of the same non-spacing mark
+ // in a sequence of non-spacing marks.
+ int32_t i;
+ UChar32 c;
+ UChar32 firstNonspacingMark = 0;
+ UBool haveMultipleMarks = FALSE;
+ UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
- if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
- // The basic test is the same for both whole and mixed script confusables.
- // Compute the set of scripts that every input character has a confusable in.
- // For this computation an input character is always considered to be
- // confusable with itself in its own script.
- //
- // If the number of such scripts is two or more, and the input consisted of
- // characters all from a single script, we have a whole script confusable.
- // (The two scripts will be the original script and the one that is confusable)
- //
- // If the number of such scripts >= one, and the original input contained characters from
- // more than one script, we have a mixed script confusable. (We can transform
- // some of the characters, and end up with a visually similar string all in
- // one script.)
-
- if (identifierInfo == NULL) {
- identifierInfo = This->getIdentifierInfo(*status);
- if (U_FAILURE(*status)) {
- goto cleanupAndReturn;
+ for (i=0; i<nfdLength ;) {
+ c = nfdText.char32At(i);
+ i += U16_LENGTH(c);
+ if (u_charType(c) != U_NON_SPACING_MARK) {
+ firstNonspacingMark = 0;
+ if (haveMultipleMarks) {
+ marksSeenSoFar.clear();
+ haveMultipleMarks = FALSE;
}
- identifierInfo->setIdentifier(id, *status);
+ continue;
}
-
- int32_t scriptCount = identifierInfo->getScriptCount();
-
- ScriptSet scripts;
- This->wholeScriptCheck(nfdText, &scripts, *status);
- int32_t confusableScriptCount = scripts.countMembers();
- //printf("confusableScriptCount = %d\n", confusableScriptCount);
-
- if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
- confusableScriptCount >= 2 &&
- scriptCount == 1) {
- result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
+ if (firstNonspacingMark == 0) {
+ firstNonspacingMark = c;
+ continue;
}
-
- if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
- confusableScriptCount >= 1 &&
- scriptCount > 1) {
- result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
+ if (!haveMultipleMarks) {
+ marksSeenSoFar.add(firstNonspacingMark);
+ haveMultipleMarks = TRUE;
}
+ if (marksSeenSoFar.contains(c)) {
+ // report the error, and stop scanning.
+ // No need to find more than the first failure.
+ result |= USPOOF_INVISIBLE;
+ break;
+ }
+ marksSeenSoFar.add(c);
}
}
-cleanupAndReturn:
- This->releaseIdentifierInfo(identifierInfo);
- if (position != NULL) {
- *position = 0;
+ checkResult->fChecks = result;
+ return checkResult->toCombinedBitmask(This->fChecks);
+}
+
+// Runs checkImpl for `id` using the checker `sc`. If checkResult is non-NULL it
+// is validated and passed to checkImpl; otherwise a temporary stack-allocated
+// CheckResult is used. Returns 0 when either handle fails validation,
+// otherwise whatever checkImpl returns.
+U_CAPI int32_t U_EXPORT2
+uspoof_check2UnicodeString(const USpoofChecker *sc,
+ const icu::UnicodeString &id,
+ USpoofCheckResult* checkResult,
+ UErrorCode *status) {
+ const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
+ if (This == NULL) {
+ // The return type is an int32_t bitmask, not a UBool, so report 0 rather than FALSE.
+ return 0;
}
- return result;
+
+ if (checkResult != NULL) {
+ CheckResult* ThisCheckResult = CheckResult::validateThis(checkResult, *status);
+ if (ThisCheckResult == NULL) {
+ return 0;
+ }
+ return checkImpl(This, id, ThisCheckResult, status);
+ } else {
+ // Stack-allocate the checkResult since this method doesn't return it
+ CheckResult stackCheckResult;
+ return checkImpl(This, id, &stackCheckResult, status);
+ }
}
@@ -681,7 +637,7 @@
U_I18N_API UnicodeString & U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
- uint32_t type,
+ uint32_t /*type*/,
const UnicodeString &id,
UnicodeString &dest,
UErrorCode *status) {
@@ -690,21 +646,9 @@
return dest;
}
- int32_t tableMask = 0;
- switch (type) {
- case 0:
- tableMask = USPOOF_ML_TABLE_FLAG;
- break;
- case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
- tableMask = USPOOF_SL_TABLE_FLAG;
- break;
- case USPOOF_ANY_CASE:
- tableMask = USPOOF_MA_TABLE_FLAG;
- break;
- case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
- tableMask = USPOOF_SA_TABLE_FLAG;
- break;
- default:
+ // Check that at least one of the CONFUSABLE flags is turned on. If not,
+ // return an error.
+ if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return dest;
}
@@ -720,7 +664,7 @@
for (inputIndex=0; inputIndex < normalizedLen; ) {
UChar32 c = nfdId.char32At(inputIndex);
inputIndex += U16_LENGTH(c);
- This->confusableLookup(c, tableMask, skelStr);
+ This->fSpoofData->confusableLookup(c, skelStr);
}
gNfdNormalizer->normalize(skelStr, dest, *status);
@@ -764,13 +708,8 @@
U_ASSERT(U_FAILURE(*status));
return 0;
}
- int32_t dataSize = This->fSpoofData->fRawData->fLength;
- if (capacity < dataSize) {
- *status = U_BUFFER_OVERFLOW_ERROR;
- return dataSize;
- }
- uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
- return dataSize;
+
+ return This->fSpoofData->serialize(buf, capacity, *status);
}
U_CAPI const USet * U_EXPORT2
@@ -797,6 +736,48 @@
return gRecommendedSet;
}
+//------------------
+// CheckResult APIs
+//------------------
+
+// Allocates a new CheckResult and returns it as an opaque USpoofCheckResult
+// handle. Returns NULL if *status already indicates failure on entry, or if
+// the allocation fails (in which case *status is set to
+// U_MEMORY_ALLOCATION_ERROR).
+U_CAPI USpoofCheckResult* U_EXPORT2
+uspoof_openCheckResult(UErrorCode *status) {
+    // Do nothing when the caller's status has already failed, as uspoof_open
+    // does; otherwise a caller that bails out on the error would leak the object.
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+    CheckResult* checkResult = new CheckResult();
+    if (checkResult == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    return checkResult->asUSpoofCheckResult();
+}
+
+// Releases a USpoofCheckResult handle. The handle is passed through
+// CheckResult::validateThis with a local status, and the pointer it yields is
+// deleted; the validation status itself is discarded.
+U_CAPI void U_EXPORT2
+uspoof_closeCheckResult(USpoofCheckResult* checkResult) {
+    UErrorCode localStatus = U_ZERO_ERROR;
+    delete CheckResult::validateThis(checkResult, localStatus);
+}
+
+// Returns the fChecks value stored in the given check result, or 0 if *status
+// indicates failure once the handle has been validated.
+U_CAPI int32_t U_EXPORT2
+uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status) {
+    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
+    if (U_SUCCESS(*status)) {
+        return validated->fChecks;
+    }
+    return 0;
+}
+
+// Returns the fRestrictionLevel value stored in the given check result, or
+// USPOOF_UNRESTRICTIVE if *status indicates failure once the handle has been
+// validated.
+U_CAPI URestrictionLevel U_EXPORT2
+uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status) {
+    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
+    if (U_SUCCESS(*status)) {
+        return validated->fRestrictionLevel;
+    }
+    return USPOOF_UNRESTRICTIVE;
+}
+
+// Returns the check result's fNumerics set viewed as a USet (via toUSet()),
+// or NULL if *status indicates failure once the handle has been validated.
+U_CAPI const USet* U_EXPORT2
+uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status) {
+    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
+    if (U_SUCCESS(*status)) {
+        return validated->fNumerics.toUSet();
+    }
+    return NULL;
+}
+
#endif // !UCONFIG_NO_NORMALIZATION
diff --git a/icu4c/source/i18n/uspoof_build.cpp b/icu4c/source/i18n/uspoof_build.cpp
index 7eff5ac..4892b9d 100644
--- a/icu4c/source/i18n/uspoof_build.cpp
+++ b/icu4c/source/i18n/uspoof_build.cpp
@@ -37,7 +37,6 @@
#include "uassert.h"
#include "uarrsort.h"
#include "uspoof_conf.h"
-#include "uspoof_wsconf.h"
#if !UCONFIG_NO_NORMALIZATION
@@ -50,7 +49,7 @@
U_CAPI USpoofChecker * U_EXPORT2
uspoof_openFromSource(const char *confusables, int32_t confusablesLen,
- const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
+ const char* /*confusablesWholeScript*/, int32_t /*confusablesWholeScriptLen*/,
int32_t *errorType, UParseError *pe, UErrorCode *status) {
uspoof_internalInitStatics(status);
if (U_FAILURE(*status)) {
@@ -76,7 +75,6 @@
// Compile the binary data from the source (text) format.
ConfusabledataBuilder::buildConfusableData(This, confusables, confusablesLen, errorType, pe, *status);
- buildWSConfusableData(This, confusablesWholeScript, confusablesWholeScriptLen, pe, *status);
if (U_FAILURE(*status)) {
delete This;
diff --git a/icu4c/source/i18n/uspoof_conf.cpp b/icu4c/source/i18n/uspoof_conf.cpp
index b438f0b..4f8b3b4 100644
--- a/icu4c/source/i18n/uspoof_conf.cpp
+++ b/icu4c/source/i18n/uspoof_conf.cpp
@@ -45,8 +45,7 @@
//
// The binary structures are described in uspoof_impl.h
//
-// 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
-// tables. Each maps from a UChar32 to a String.
+// 1. Parse the data, making a hash table mapping from a UChar32 to a String.
//
// 2. Sort all of the strings encountered by length, since they will need to
// be stored in that order in the final string table.
@@ -63,7 +62,7 @@
SPUString::SPUString(UnicodeString *s) {
fStr = s;
- fStrTableIndex = 0;
+ fCharOrStrTableIndex = 0;
}
@@ -145,15 +144,11 @@
ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
fSpoofImpl(spImpl),
fInput(NULL),
- fSLTable(NULL),
- fSATable(NULL),
- fMLTable(NULL),
- fMATable(NULL),
+ fTable(NULL),
fKeySet(NULL),
fKeyVec(NULL),
fValueVec(NULL),
fStringTable(NULL),
- fStringLengthsTable(NULL),
stringPool(NULL),
fParseLine(NULL),
fParseHexNum(NULL),
@@ -162,10 +157,7 @@
if (U_FAILURE(status)) {
return;
}
- fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
- fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
- fMLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
- fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
+ fTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
fKeySet = new UnicodeSet();
fKeyVec = new UVector(status);
fValueVec = new UVector(status);
@@ -177,14 +169,10 @@
uprv_free(fInput);
uregex_close(fParseLine);
uregex_close(fParseHexNum);
- uhash_close(fSLTable);
- uhash_close(fSATable);
- uhash_close(fMLTable);
- uhash_close(fMATable);
+ uhash_close(fTable);
delete fKeySet;
delete fKeyVec;
delete fStringTable;
- delete fStringLengthsTable;
delete fValueVec;
delete stringPool;
}
@@ -230,7 +218,7 @@
// any line. What was matched is determined by examining which capture groups have a match.
// Capture Group 1: the source char
// Capture Group 2: the replacement chars
- // Capture Group 3-6 the table type, SL, SA, ML, or MA
+ // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
// Capture Group 7: A blank or comment only line.
// Capture Group 8: A syntactically invalid line. Anything that didn't match before.
// Example Line from the confusables.txt source file:
@@ -296,41 +284,12 @@
// This a little like a Java intern() - any duplicates will be eliminated.
SPUString *smapString = stringPool->addString(mapString, status);
- // Add the UChar32 -> string mapping to the appropriate table.
- UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
- uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
- uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
- uregex_start(fParseLine, 6, &status) >= 0 ? fMATable :
- NULL;
- if (U_SUCCESS(status) && table == NULL) {
- status = U_PARSE_ERROR;
- }
- if (U_FAILURE(status)) {
- return;
- }
-
+ // Add the UChar32 -> string mapping to the table.
// For Unicode 8, the SL, SA and ML tables have been discontinued.
// All input data from confusables.txt is tagged MA.
- // ICU spoof check functions should ignore the specified table and always
- // use this MA Data.
- // For now, implement by populating the MA data into all four tables, and
- // keep the multiple table implementation in place, in case it comes back
- // at some time in the future.
- // There is no run time size penalty to keeping the four table implementation -
- // the data is shared when it's the same betweeen tables.
- if (table != fMATable) {
- status = U_PARSE_ERROR;
- return;
- };
- // uhash_iput(table, keyChar, smapString, &status);
- uhash_iput(fSLTable, keyChar, smapString, &status);
- uhash_iput(fSATable, keyChar, smapString, &status);
- uhash_iput(fMLTable, keyChar, smapString, &status);
- uhash_iput(fMATable, keyChar, smapString, &status);
+ uhash_iput(fTable, keyChar, smapString, &status);
+ if (U_FAILURE(status)) { return; }
fKeySet->add(keyChar);
- if (U_FAILURE(status)) {
- return;
- }
}
// Input data is now all parsed and collected.
@@ -343,43 +302,24 @@
// Build up the string array, and record the index of each string therein
// in the (build time only) string pool.
// Strings of length one are not entered into the strings array.
- // At the same time, build up the string lengths table, which records the
- // position in the string table of the first string of each length >= 4.
// (Strings in the table are sorted by length)
stringPool->sort(status);
fStringTable = new UnicodeString();
- fStringLengthsTable = new UVector(status);
- int32_t previousStringLength = 0;
- int32_t previousStringIndex = 0;
int32_t poolSize = stringPool->size();
int32_t i;
for (i=0; i<poolSize; i++) {
SPUString *s = stringPool->getByIndex(i);
int32_t strLen = s->fStr->length();
int32_t strIndex = fStringTable->length();
- U_ASSERT(strLen >= previousStringLength);
if (strLen == 1) {
// strings of length one do not get an entry in the string table.
// Keep the single string character itself here, which is the same
// convention that is used in the final run-time string table index.
- s->fStrTableIndex = s->fStr->charAt(0);
+ s->fCharOrStrTableIndex = s->fStr->charAt(0);
} else {
- if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
- fStringLengthsTable->addElement(previousStringIndex, status);
- fStringLengthsTable->addElement(previousStringLength, status);
- }
- s->fStrTableIndex = strIndex;
+ s->fCharOrStrTableIndex = strIndex;
fStringTable->append(*(s->fStr));
}
- previousStringLength = strLen;
- previousStringIndex = strIndex;
- }
- // Make the final entry to the string lengths table.
- // (it holds an entry for the _last_ string of each length, so adding the
- // final one doesn't happen in the main loop because no longer string was encountered.)
- if (previousStringLength >= 4) {
- fStringLengthsTable->addElement(previousStringIndex, status);
- fStringLengthsTable->addElement(previousStringLength, status);
}
// Construct the compile-time Key and Value tables
@@ -398,10 +338,15 @@
// code points requires a nested loop.
for (UChar32 keyChar=fKeySet->getRangeStart(range);
keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
- addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status);
- addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status);
- addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status);
- addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status);
+ SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
+ U_ASSERT(targetMapping != NULL);
+
+ int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
+ targetMapping->fStr->length());
+ int32_t value = targetMapping->fCharOrStrTableIndex;
+
+ fKeyVec->addElement(key, status);
+ fValueVec->addElement(value, status);
}
}
@@ -437,14 +382,14 @@
return;
}
int i;
- int32_t previousKey = 0;
+ UChar32 previousCodePoint = 0;
for (i=0; i<numKeys; i++) {
int32_t key = fKeyVec->elementAti(i);
- (void)previousKey; // Suppress unused variable warning on gcc.
- U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
- U_ASSERT((key & 0xff000000) != 0);
+ UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
+ // strictly greater because there can be only one entry per code point
+ U_ASSERT(codePoint > previousCodePoint);
keys[i] = key;
- previousKey = key;
+ previousCodePoint = codePoint;
}
SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
@@ -486,143 +431,6 @@
rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
rawData->fCFUStringTableLen = stringsLength;
fSpoofImpl->fSpoofData->fCFUStrings = strings;
-
- // The String Lengths Table
- // While copying into the runtime array do some sanity checks on the values
- // Each complete entry contains two fields, an index and an offset.
- // Lengths should increase with each entry.
- // Offsets should be less than the size of the string table.
- int32_t lengthTableLength = fStringLengthsTable->size();
- uint16_t *stringLengths =
- static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status));
- if (U_FAILURE(status)) {
- return;
- }
- int32_t destIndex = 0;
- uint32_t previousLength = 0;
- for (i=0; i<lengthTableLength; i+=2) {
- uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i));
- uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1));
- U_ASSERT(offset < stringsLength);
- U_ASSERT(length < 40);
- (void)previousLength; // Suppress unused variable warning on gcc.
- U_ASSERT(length > previousLength);
- stringLengths[destIndex++] = static_cast<uint16_t>(offset);
- stringLengths[destIndex++] = static_cast<uint16_t>(length);
- previousLength = length;
- }
- rawData = fSpoofImpl->fSpoofData->fRawData;
- rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData);
- // Note: StringLengthsSize in the raw data is the number of complete entries,
- // each consisting of a pair of 16 bit values, hence the divide by 2.
- rawData->fCFUStringLengthsSize = lengthTableLength / 2;
- fSpoofImpl->fSpoofData->fCFUStringLengths =
- reinterpret_cast<SpoofStringLengthsElement *>(stringLengths);
-}
-
-
-
-// addKeyEntry Construction of the confusable Key and Mapping Values tables.
-// This is an intermediate point in the building process.
-// We already have the mappings in the hash tables fSLTable, etc.
-// This function builds corresponding run-time style table entries into
-// fKeyVec and fValueVec
-
-void ConfusabledataBuilder::addKeyEntry(
- UChar32 keyChar, // The key character
- UHashtable *table, // The table, one of SATable, MATable, etc.
- int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
- UErrorCode &status) {
-
- SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
- if (targetMapping == NULL) {
- // No mapping for this key character.
- // (This function is called for all four tables for each key char that
- // is seen anywhere, so this no entry cases are very much expected.)
- return;
- }
-
- // Check whether there is already an entry with the correct mapping.
- // If so, simply set the flag in the keyTable saying that the existing entry
- // applies to the table that we're doing now.
-
- UBool keyHasMultipleValues = FALSE;
- int32_t i;
- for (i=fKeyVec->size()-1; i>=0 ; i--) {
- int32_t key = fKeyVec->elementAti(i);
- if ((key & 0x0ffffff) != keyChar) {
- // We have now checked all existing key entries for this key char (if any)
- // without finding one with the same mapping.
- break;
- }
- UnicodeString mapping = getMapping(i);
- if (mapping == *(targetMapping->fStr)) {
- // The run time entry we are currently testing has the correct mapping.
- // Set the flag in it indicating that it applies to the new table also.
- key |= tableFlag;
- fKeyVec->setElementAt(key, i);
- return;
- }
- keyHasMultipleValues = TRUE;
- }
-
- // Need to add a new entry to the binary data being built for this mapping.
- // Includes adding entries to both the key table and the parallel values table.
-
- int32_t newKey = keyChar | tableFlag;
- if (keyHasMultipleValues) {
- newKey |= USPOOF_KEY_MULTIPLE_VALUES;
- }
- int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
- if (adjustedMappingLength>3) {
- adjustedMappingLength = 3;
- }
- newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;
-
- int32_t newData = targetMapping->fStrTableIndex;
-
- fKeyVec->addElement(newKey, status);
- fValueVec->addElement(newData, status);
-
- // If the preceding key entry is for the same key character (but with a different mapping)
- // set the multiple-values flag on it.
- if (keyHasMultipleValues) {
- int32_t previousKeyIndex = fKeyVec->size() - 2;
- int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
- previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
- fKeyVec->setElementAt(previousKey, previousKeyIndex);
- }
-}
-
-
-
-UnicodeString ConfusabledataBuilder::getMapping(int32_t index) {
- int32_t key = fKeyVec->elementAti(index);
- int32_t value = fValueVec->elementAti(index);
- int32_t length = USPOOF_KEY_LENGTH_FIELD(key);
- int32_t lastIndexWithLen;
- switch (length) {
- case 0:
- return UnicodeString(static_cast<UChar>(value));
- case 1:
- case 2:
- return UnicodeString(*fStringTable, value, length+1);
- case 3:
- length = 0;
- int32_t i;
- for (i=0; i<fStringLengthsTable->size(); i+=2) {
- lastIndexWithLen = fStringLengthsTable->elementAti(i);
- if (value <= lastIndexWithLen) {
- length = fStringLengthsTable->elementAti(i+1);
- break;
- }
- }
- U_ASSERT(length>=3);
- return UnicodeString(*fStringTable, value, length);
- default:
- U_ASSERT(FALSE);
- }
- return UnicodeString();
}
#endif
diff --git a/icu4c/source/i18n/uspoof_conf.h b/icu4c/source/i18n/uspoof_conf.h
index 72162a6..bc5e4a9 100644
--- a/icu4c/source/i18n/uspoof_conf.h
+++ b/icu4c/source/i18n/uspoof_conf.h
@@ -38,9 +38,9 @@
struct SPUString : public UMemory {
UnicodeString *fStr; // The actual string.
- int32_t fStrTableIndex; // Index into the final runtime data for this string.
- // (or, for length 1, the single string char itself,
- // there being no string table entry for it.)
+ int32_t fCharOrStrTableIndex; // Index into the final runtime data for this
+ // string (or, for length 1, the single string char
+ // itself, there being no string table entry for it.)
SPUString(UnicodeString *s);
~SPUString();
};
@@ -88,10 +88,7 @@
private:
SpoofImpl *fSpoofImpl;
UChar *fInput;
- UHashtable *fSLTable;
- UHashtable *fSATable;
- UHashtable *fMLTable;
- UHashtable *fMATable;
+ UHashtable *fTable;
UnicodeSet *fKeySet; // A set of all keys (UChar32s) that go into the four mapping tables.
// The binary data is first assembled into the following four collections, then
@@ -99,7 +96,6 @@
UVector *fKeyVec;
UVector *fValueVec;
UnicodeString *fStringTable;
- UVector *fStringLengthsTable;
SPUStringPool *stringPool;
URegularExpression *fParseLine;
diff --git a/icu4c/source/i18n/uspoof_impl.cpp b/icu4c/source/i18n/uspoof_impl.cpp
index 04c6202..7002669 100644
--- a/icu4c/source/i18n/uspoof_impl.cpp
+++ b/icu4c/source/i18n/uspoof_impl.cpp
@@ -15,11 +15,11 @@
#include "utrie2.h"
#include "cmemory.h"
#include "cstring.h"
-#include "identifier_info.h"
#include "scriptset.h"
#include "umutex.h"
#include "udataswp.h"
#include "uassert.h"
+#include "ucln_in.h"
#include "uspoof_impl.h"
#if !UCONFIG_NO_NORMALIZATION
@@ -29,14 +29,38 @@
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
-SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
- fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
- if (U_FAILURE(status)) {
- return;
- }
+// Builds a checker around caller-supplied spoof data. construct() first puts
+// the members into their default state (including fSpoofData = NULL); then
+// fSpoofData is pointed at data, whatever outcome construct() left in status.
+SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
+ construct(status);
+ fSpoofData = data;
+}
+
+// Builds a checker that uses the shared default confusables data.
+// Errors from construct() or from loading the default data are reported
+// through status; SpoofData::getDefault() returns NULL when status is a failure.
+SpoofImpl::SpoofImpl(UErrorCode& status) {
+ construct(status);
+
+ // TODO: Call this method where it is actually needed, instead of in the
+ // constructor, to allow for lazy data loading. See #12696.
+ fSpoofData = SpoofData::getDefault(status);
+}
+
+// Default constructor: same setup as the UErrorCode& overload, but the error
+// code is a local variable, so a failure in construct() or in loading the
+// default data is not reported to the caller (fSpoofData is then NULL).
+SpoofImpl::SpoofImpl() {
+ UErrorCode status = U_ZERO_ERROR;
+ construct(status);
+
+ // TODO: Call this method where it is actually needed, instead of in the
+ // constructor, to allow for lazy data loading. See #12696.
+ fSpoofData = SpoofData::getDefault(status);
+}
+
+void SpoofImpl::construct(UErrorCode& status) {
+ fMagic = USPOOF_MAGIC;
+ fChecks = USPOOF_ALL_CHECKS;
+ fSpoofData = NULL;
+ fAllowedCharsSet = NULL;
+ fAllowedLocales = NULL;
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
+ if (U_FAILURE(status)) { return; }
+
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet;
@@ -45,25 +69,13 @@
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
- fMagic = USPOOF_MAGIC;
-}
-
-
-SpoofImpl::SpoofImpl() :
- fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
- UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
- allowedCharsSet->freeze();
- fAllowedCharsSet = allowedCharsSet;
- fAllowedLocales = uprv_strdup("");
- fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
}
// Copy Constructor, used by the user level clone() function.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
+ fAllowedLocales(NULL) {
if (U_FAILURE(status)) {
return;
}
@@ -88,7 +100,11 @@
}
delete fAllowedCharsSet;
uprv_free((void *)fAllowedLocales);
- delete fCachedIdentifierInfo;
+}
+
+// Cast this instance as a USpoofChecker for the C API.
+// This is a pure pointer reinterpretation; no new object is created, so the
+// returned handle is valid exactly as long as this SpoofImpl is.
+USpoofChecker *SpoofImpl::asUSpoofChecker() {
+ return reinterpret_cast<USpoofChecker*>(this);
}
//
@@ -104,12 +120,11 @@
return NULL;
}
SpoofImpl *This = (SpoofImpl *)sc;
- if (This->fMagic != USPOOF_MAGIC ||
- This->fSpoofData == NULL) {
+ if (This->fMagic != USPOOF_MAGIC) {
status = U_INVALID_FORMAT_ERROR;
return NULL;
}
- if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
+ if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
return NULL;
}
return This;
@@ -121,148 +136,6 @@
}
-
-//--------------------------------------------------------------------------------------
-//
-// confusableLookup() This is the heart of the confusable skeleton generation
-// implementation.
-//
-// Given a source character, produce the corresponding
-// replacement character(s), appending them to the dest string.
-//
-//---------------------------------------------------------------------------------------
-int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
-
- // Binary search the spoof data key table for the inChar
- int32_t *low = fSpoofData->fCFUKeys;
- int32_t *mid = NULL;
- int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
- UChar32 midc;
- do {
- int32_t delta = ((int32_t)(limit-low))/2;
- mid = low + delta;
- midc = *mid & 0x1fffff;
- if (inChar == midc) {
- goto foundChar;
- } else if (inChar < midc) {
- limit = mid;
- } else {
- low = mid;
- }
- } while (low < limit-1);
- mid = low;
- midc = *mid & 0x1fffff;
- if (inChar != midc) {
- // Char not found. It maps to itself.
- int i = 0;
- dest.append(inChar);
- return i;
- }
- foundChar:
- int32_t keyFlags = *mid & 0xff000000;
- if ((keyFlags & tableMask) == 0) {
- // We found the right key char, but the entry doesn't pertain to the
- // table we need. See if there is an adjacent key that does
- if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
- int32_t *altMid;
- for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
- keyFlags = *altMid & 0xff000000;
- if (keyFlags & tableMask) {
- mid = altMid;
- goto foundKey;
- }
- }
- for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
- keyFlags = *altMid & 0xff000000;
- if (keyFlags & tableMask) {
- mid = altMid;
- goto foundKey;
- }
- }
- }
- // No key entry for this char & table.
- // The input char maps to itself.
- int i = 0;
- dest.append(inChar);
- return i;
- }
-
- foundKey:
- int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
- int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
-
- // Value is either a UChar (for strings of length 1) or
- // an index into the string table (for longer strings)
- uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
- if (stringLen == 1) {
- dest.append((UChar)value);
- return 1;
- }
-
- // String length of 4 from the above lookup is used for all strings of length >= 4.
- // For these, get the real length from the string lengths table,
- // which maps string table indexes to lengths.
- // All strings of the same length are stored contiguously in the string table.
- // 'value' from the lookup above is the starting index for the desired string.
-
- int32_t ix;
- if (stringLen == 4) {
- int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
- for (ix = 0; ix < stringLengthsLimit; ix++) {
- if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
- stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
- break;
- }
- }
- U_ASSERT(ix < stringLengthsLimit);
- }
-
- U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
- UChar *src = &fSpoofData->fCFUStrings[value];
- dest.append(src, stringLen);
- return stringLen;
-}
-
-
-//---------------------------------------------------------------------------------------
-//
-// wholeScriptCheck()
-//
-// Input text is already normalized to NFD
-// Return the set of scripts, each of which can represent something that is
-// confusable with the input text. The script of the input text
-// is included; input consisting of characters from a single script will
-// always produce a result consisting of a set containing that script.
-//
-//---------------------------------------------------------------------------------------
-void SpoofImpl::wholeScriptCheck(
- const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
-
- UTrie2 *table =
- (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
- result->setAll();
- int32_t length = text.length();
- for (int32_t inputIdx=0; inputIdx < length;) {
- UChar32 c = text.char32At(inputIdx);
- inputIdx += U16_LENGTH(c);
- uint32_t index = utrie2_get32(table, c);
- if (index == 0) {
- // No confusables in another script for this char.
- // TODO: we should change the data to have sets with just the single script
- // bit for the script of this char. Gets rid of this special case.
- // Until then, grab the script from the char and intersect it with the set.
- UScriptCode cpScript = uscript_getScript(c, &status);
- U_ASSERT(cpScript > USCRIPT_INHERITED);
- result->intersect(cpScript, status);
- } else if (index == 1) {
- // Script == Common or Inherited. Nothing to do.
- } else {
- result->intersect(fSpoofData->fScriptSets[index]);
- }
- }
-}
-
-
void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
UnicodeSet allowedChars;
UnicodeSet *tmpSet = NULL;
@@ -374,6 +247,137 @@
}
}
+// Fills result with the augmented script set of codePoint (UTS 39 section 5.1):
+// the character's script extensions plus the combined writing-system codes
+// (Hanb, Jpan, Kore) that those scripts imply. Any previous content of result
+// is discarded. On failure, status is set and result is left as computed so far.
+void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
+    result.resetAll();
+    result.setScriptExtensions(codePoint, status);
+    if (U_FAILURE(status)) { return; }
+
+    // Section 5.1 step 1, grouped by the combined code that gets added.
+    // Han contributes to all three combined codes.
+    const UBool hasHan = result.test(USCRIPT_HAN, status);
+    if (hasHan || result.test(USCRIPT_BOPOMOFO, status)) {
+        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
+    }
+    if (hasHan
+            || result.test(USCRIPT_HIRAGANA, status)
+            || result.test(USCRIPT_KATAKANA, status)) {
+        result.set(USCRIPT_JAPANESE, status);
+    }
+    if (hasHan || result.test(USCRIPT_HANGUL, status)) {
+        result.set(USCRIPT_KOREAN, status);
+    }
+
+    // Section 5.1 step 2: Common and Inherited characters are compatible
+    // with every script, so they widen the set to "all".
+    const UBool matchesEverything =
+        result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status);
+    if (matchesEverything) {
+        result.setAll();
+    }
+}
+
+// Computes the resolved script set for a string, according to UTS 39 section 5.1.
+// Delegates to getResolvedScriptSetWithout() with USCRIPT_CODE_LIMIT, the
+// value that excludes no script, so every character of input contributes.
+void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
+ getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
+}
+
+// Intersects the augmented script sets of the characters of input into result,
+// skipping every character whose augmented set contains `script`.
+// Passing USCRIPT_CODE_LIMIT as `script` disables the skipping, so all
+// characters take part. result starts out as the full set; if status becomes
+// a failure, the function returns at once with result only partly narrowed.
+void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
+    result.setAll();
+
+    const UBool keepEveryCharacter = (script == USCRIPT_CODE_LIMIT);
+    ScriptSet augmented;
+    int32_t pos = 0;
+    while (pos < input.length()) {
+        const UChar32 cp = input.char32At(pos);
+        pos += U16_LENGTH(cp);
+
+        // Augmented script set of the current character.
+        getAugmentedScriptSet(cp, augmented, status);
+        if (U_FAILURE(status)) { return; }
+
+        // Characters carrying the excluded script do not narrow the result.
+        if (!keepEveryCharacter && augmented.test(script, status)) {
+            continue;
+        }
+        result.intersect(augmented);
+    }
+}
+
+// Collects into result one representative per decimal-digit system used in
+// input (UTS 39 section 5.3). The representative is the zero digit of the
+// system, obtained as (digit code point - its numeric value). result is
+// cleared first; the status parameter is unused.
+void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
+    result.clear();
+
+    int32_t pos = 0;
+    while (pos < input.length()) {
+        const UChar32 cp = input.char32At(pos);
+        pos += U16_LENGTH(cp);
+
+        if (u_charType(cp) != U_DECIMAL_DIGIT_NUMBER) {
+            continue;   // only decimal digits identify a numbering system
+        }
+        // Unicode guarantees the digits 0..9 of a system are contiguous,
+        // so subtracting the digit's value yields that system's zero.
+        const UChar32 digitValue = (UChar32)u_getNumericValue(cp);
+        result.add(cp - digitValue);
+    }
+}
+
+// Determines the restriction level of input following the numbered steps of
+// UTS 39 section 5.2. Returns USPOOF_UNRESTRICTIVE when input contains a
+// character outside fAllowedCharsSet, and also when status becomes a failure
+// while the script sets are being resolved.
+URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
+    // Step 1: every character must be in the allowed set.
+    if (!fAllowedCharsSet->containsAll(input)) {
+        return USPOOF_UNRESTRICTIVE;
+    }
+
+    // Step 2: ASCII-only input. The Java implementation uses a static
+    // UnicodeSet for this test; here a plain scan over the code units avoids
+    // the static variable.
+    const int32_t length = input.length();
+    int32_t asciiPrefix = 0;
+    while (asciiPrefix < length && input.charAt(asciiPrefix) <= 0x7f) {
+        ++asciiPrefix;
+    }
+    if (asciiPrefix == length) {
+        return USPOOF_ASCII;
+    }
+
+    // Step 3: resolved script set of the whole string.
+    ScriptSet resolved;
+    getResolvedScriptSet(input, resolved, status);
+    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
+
+    // Step 4: a non-empty resolved set means a single script covers everything.
+    if (!resolved.isEmpty()) {
+        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
+    }
+
+    // Step 5: resolve again, ignoring the characters that have Latin.
+    ScriptSet resolvedWithoutLatin;
+    getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedWithoutLatin, status);
+    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
+
+    // Step 6: Latin combined with one of the CJK writing systems.
+    const UBool latinPlusCJK =
+        resolvedWithoutLatin.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
+        || resolvedWithoutLatin.test(USCRIPT_JAPANESE, status)
+        || resolvedWithoutLatin.test(USCRIPT_KOREAN, status);
+    if (latinPlusCJK) {
+        return USPOOF_HIGHLY_RESTRICTIVE;
+    }
+
+    // Steps 7 and 8: Latin plus exactly one other script is Moderately
+    // Restrictive, unless that script is Cyrillic, Greek or Cherokee (or no
+    // single script remains), in which case the level drops to Minimally
+    // Restrictive.
+    const UBool onlyMinimal =
+        resolvedWithoutLatin.isEmpty()
+        || resolvedWithoutLatin.test(USCRIPT_CYRILLIC, status)
+        || resolvedWithoutLatin.test(USCRIPT_GREEK, status)
+        || resolvedWithoutLatin.test(USCRIPT_CHEROKEE, status);
+    return onlyMinimal ? USPOOF_MINIMALLY_RESTRICTIVE : USPOOF_MODERATELY_RESTRICTIVE;
+}
+
+
+
+
// Convert a text format hex number. Utility function used by builder code. Static.
// Input: UChar *string text. Output: a UChar32
@@ -406,55 +410,60 @@
return (UChar32)val;
}
-// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
-// Maintain a one-element cache, which is sufficient to avoid repeatedly
-// creating new ones unless we get multi-thread concurrency in spoof
-// check operations, which should be statistically uncommon.
-// These functions are used in place of new & delete of an IdentifierInfo.
-// They will recycle the IdentifierInfo when possible.
-// They are logically const, and used within const functions that must be thread safe.
-IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
- IdentifierInfo *returnIdInfo = NULL;
- if (U_FAILURE(status)) {
- return returnIdInfo;
- }
- SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
- {
- Mutex m;
- returnIdInfo = nonConstThis->fCachedIdentifierInfo;
- nonConstThis->fCachedIdentifierInfo = NULL;
- }
- if (returnIdInfo == NULL) {
- returnIdInfo = new IdentifierInfo(status);
- if (U_SUCCESS(status) && returnIdInfo == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- if (U_FAILURE(status) && returnIdInfo != NULL) {
- delete returnIdInfo;
- returnIdInfo = NULL;
- }
- }
- return returnIdInfo;
+//-----------------------------------------
+//
+// class CheckResult Implementation
+//
+//-----------------------------------------
+
+// Stamps the object with USPOOF_CHECK_MAGIC (the value validateThis() checks
+// for) and puts the remaining fields into their empty state via clear().
+CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
+ clear();
}
+// Cast this instance as a USpoofCheckResult handle for the C API.
+// Pure pointer reinterpretation; validateThis() performs the reverse cast.
+USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
+ return reinterpret_cast<USpoofCheckResult*>(this);
+}
-void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
- if (idInfo != NULL) {
- SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
- {
- Mutex m;
- if (nonConstThis->fCachedIdentifierInfo == NULL) {
- nonConstThis->fCachedIdentifierInfo = idInfo;
- idInfo = NULL;
- }
- }
- delete idInfo;
+//
+// Validates the incoming status and the CheckResult handle received from the
+// C API, and converts the handle back into a CheckResult pointer.
+//
+// Returns NULL without touching status if status is already a failure.
+// Sets U_ILLEGAL_ARGUMENT_ERROR for a NULL handle and U_INVALID_FORMAT_ERROR
+// when the object does not carry USPOOF_CHECK_MAGIC; both return NULL.
+//
+const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+    if (ptr == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+    const CheckResult *candidate = reinterpret_cast<const CheckResult*>(ptr);
+    if (candidate->fMagic == USPOOF_CHECK_MAGIC) {
+        return candidate;
+    }
+    // Wrong magic: the handle does not refer to a live CheckResult.
+    status = U_INVALID_FORMAT_ERROR;
+    return NULL;
+}
+
+// Non-const overload: forwards to the const overload above so the checks live
+// in one place, then removes the const from the returned pointer.
+CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
+ return const_cast<CheckResult *>
+ (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
+}
+
+// Returns the result to its empty state: no check bits set, no numerics
+// recorded, and the restriction level marked as undefined. fMagic is untouched.
+void CheckResult::clear() {
+ fChecks = 0;
+ fNumerics.clear();
+ fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
+}
+
+// Returns fChecks, OR-ed with fRestrictionLevel only when both hold:
+// the USPOOF_AUX_INFO bit is set in enabledChecks, and a restriction level
+// has been recorded (it is not USPOOF_UNDEFINED_RESTRICTIVE).
+int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
+ if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
+ return fChecks | fRestrictionLevel;
+ } else {
+ return fChecks;
}
}
-
-
+// No explicit cleanup is performed here.
+CheckResult::~CheckResult() {
+}
//----------------------------------------------------------------------------------------------
//
@@ -463,12 +472,14 @@
//----------------------------------------------------------------------------------------------
-UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
+UBool SpoofData::validateDataVersion(UErrorCode &status) const {
if (U_FAILURE(status) ||
- rawData == NULL ||
- rawData->fMagic != USPOOF_MAGIC ||
- rawData->fFormatVersion[0] > 1 ||
- rawData->fFormatVersion[1] > 0) {
+ fRawData == NULL ||
+ fRawData->fMagic != USPOOF_MAGIC ||
+ fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
+ fRawData->fFormatVersion[1] != 0 ||
+ fRawData->fFormatVersion[2] != 0 ||
+ fRawData->fFormatVersion[3] != 0) {
status = U_INVALID_FORMAT_ERROR;
return FALSE;
}
@@ -487,7 +498,7 @@
pInfo->dataFormat[1] == 0x66 &&
pInfo->dataFormat[2] == 0x75 &&
pInfo->dataFormat[3] == 0x20 &&
- pInfo->formatVersion[0] == 1
+ pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
) {
UVersionInfo *version = static_cast<UVersionInfo *>(context);
if(version != NULL) {
@@ -499,32 +510,61 @@
}
}
+// Methods for the loading of the default confusables data file. The confusable
+// data is loaded only when it is needed.
//
-// SpoofData::getDefault() - return a wrapper around the spoof data that is
-// baked into the default ICU data.
+// SpoofData::getDefault() - Return the default confusables data, and call the
+// initOnce() if it is not available. Adds a reference
+// to the SpoofData that the caller is responsible for
+// decrementing when they are done with the data.
//
-// Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
-// SpoofData is shared by all spoof checkers using the default data.
+// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
+// is shared by all spoof checkers using the default data.
//
-SpoofData *SpoofData::getDefault(UErrorCode &status) {
+// uspoof_cleanupDefaultData - Called during cleanup.
+//
+
+// Init-once guard for loading the default confusables data.
+static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
+// The shared default data: set by uspoof_loadDefaultData(), cleared by
+// uspoof_cleanupDefaultData().
+static SpoofData* gDefaultSpoofData;
+
+// Cleanup callback registered through ucln_i18n_registerCleanup().
+// If default data is loaded, drops this file's reference to it, clears the
+// global pointer and resets the init-once guard. Does nothing when no data is
+// loaded. Always returns TRUE.
+static UBool U_CALLCONV
+uspoof_cleanupDefaultData(void) {
+ if (gDefaultSpoofData) {
+ // Will delete, assuming all user-level spoof checkers were closed.
+ gDefaultSpoofData->removeReference();
+ gDefaultSpoofData = NULL;
+ gSpoofInitDefaultOnce.reset();
+ }
+ return TRUE;
+}
+
+// Init-once callback for umtx_initOnce(): opens the "confusables.cfu" data
+// item and wraps it in the shared gDefaultSpoofData object.
+//
+// On success, gDefaultSpoofData is non-NULL and uspoof_cleanupDefaultData is
+// registered for library cleanup. On any failure, status is set and
+// gDefaultSpoofData is left NULL, so no stale pointer can be observed later.
+// Declared U_CALLCONV to match the callback type umtx_initOnce() expects,
+// like the cleanup callback in this file.
+static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
spoofDataIsAcceptable,
NULL, // context, would receive dataVersion if supplied.
&status);
+ if (U_FAILURE(status)) { return; }
+ gDefaultSpoofData = new SpoofData(udm, status);
if (U_FAILURE(status)) {
- return NULL;
+ delete gDefaultSpoofData;
+ // Do not leave a dangling pointer behind in the file-level global.
+ gDefaultSpoofData = NULL;
+ return;
}
- SpoofData *This = new SpoofData(udm, status);
- if (U_FAILURE(status)) {
- delete This;
- return NULL;
- }
- if (This == NULL) {
+ if (gDefaultSpoofData == NULL) {
+ // Allocation failed, so no SpoofData took over udm; close it here to
+ // avoid leaking the opened data item.
+ udata_close(udm);
status = U_MEMORY_ALLOCATION_ERROR;
+ return;
}
- return This;
+ ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
}
+// Returns the shared default SpoofData, loading it on first use through
+// umtx_initOnce(). On success a reference is added on behalf of the caller;
+// on failure NULL is returned and no reference is taken.
+SpoofData* SpoofData::getDefault(UErrorCode& status) {
+ umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
+ if (U_FAILURE(status)) { return NULL; }
+ gDefaultSpoofData->addReference();
+ return gDefaultSpoofData;
+}
+
+
+
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
reset();
@@ -535,7 +575,7 @@
// fRawData is non-const because it may be constructed by the data builder.
fRawData = reinterpret_cast<SpoofDataHeader *>(
const_cast<void *>(udata_getMemory(udm)));
- validateDataVersion(fRawData, status);
+ validateDataVersion(status);
initPtrs(status);
}
@@ -556,7 +596,7 @@
status = U_INVALID_FORMAT_ERROR;
return;
}
- validateDataVersion(fRawData, status);
+ validateDataVersion(status);
initPtrs(status);
}
@@ -584,7 +624,7 @@
uprv_memset(fRawData, 0, initialSize);
fRawData->fMagic = USPOOF_MAGIC;
- fRawData->fFormatVersion[0] = 1;
+ fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
fRawData->fFormatVersion[1] = 0;
fRawData->fFormatVersion[2] = 0;
fRawData->fFormatVersion[3] = 0;
@@ -602,11 +642,7 @@
fRefCount = 1;
fCFUKeys = NULL;
fCFUValues = NULL;
- fCFUStringLengths = NULL;
fCFUStrings = NULL;
- fAnyCaseTrie = NULL;
- fLowerCaseTrie = NULL;
- fScriptSets = NULL;
}
@@ -628,7 +664,6 @@
void SpoofData::initPtrs(UErrorCode &status) {
fCFUKeys = NULL;
fCFUValues = NULL;
- fCFUStringLengths = NULL;
fCFUStrings = NULL;
if (U_FAILURE(status)) {
return;
@@ -639,33 +674,13 @@
if (fRawData->fCFUStringIndex != 0) {
fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
}
- if (fRawData->fCFUStringLengths != 0) {
- fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
- }
if (fRawData->fCFUStringTable != 0) {
fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
}
-
- if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
- fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
- (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
- }
- if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
- fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
- (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
- }
-
- if (fRawData->fScriptSets != 0) {
- fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
- }
}
SpoofData::~SpoofData() {
- utrie2_close(fAnyCaseTrie);
- fAnyCaseTrie = NULL;
- utrie2_close(fLowerCaseTrie);
- fLowerCaseTrie = NULL;
if (fDataOwned) {
uprv_free(fRawData);
}
@@ -710,6 +725,78 @@
return (char *)fRawData + returnOffset;
}
+int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
+ int32_t dataSize = fRawData->fLength;
+ if (capacity < dataSize) {
+ status = U_BUFFER_OVERFLOW_ERROR;
+ return dataSize;
+ }
+ uprv_memcpy(buf, fRawData, dataSize);
+ return dataSize;
+}
+
+int32_t SpoofData::size() const {
+ return fRawData->fLength;
+}
+
+//-------------------------------
+//
+// Front-end APIs for SpoofData
+//
+//-------------------------------
+
+int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
+ // Binary search of the sorted key table; appends the skeleton of inChar to dest.
+ // [lo, hi), i.e., lo is inclusive, hi is exclusive.
+ // The candidate entry after the loop will be in lo.
+ int32_t lo = 0;
+ int32_t hi = length();
+ while (hi - lo > 1) {
+ int32_t mid = (lo + hi) / 2;
+ if (codePointAt(mid) > inChar) {
+ hi = mid;
+ } else if (codePointAt(mid) < inChar) {
+ lo = mid;
+ } else {
+ // Found result. Break early.
+ lo = mid;
+ break;
+ }
+ }
+
+ // No entry (or an empty table, which must never be indexed): the char maps to itself.
+ if (length() == 0 || codePointAt(lo) != inChar) {
+ dest.append(inChar);
+ return 1;
+ }
+
+ // Add the element to the string builder and return.
+ return appendValueTo(lo, dest);
+}
+
+int32_t SpoofData::length() const {
+ return fRawData->fCFUKeysSize;
+}
+
+UChar32 SpoofData::codePointAt(int32_t index) const {
+ return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
+}
+
+int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
+ int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
+
+ // Value is either a char (for strings of length 1) or
+ // an index into the string table (for longer strings)
+ uint16_t value = fCFUValues[index];
+ if (stringLength == 1) {
+ dest.append((UChar)value);
+ } else {
+ dest.append(fCFUStrings + value, stringLength);
+ }
+
+ return stringLength;
+}
+
U_NAMESPACE_END
@@ -741,7 +828,10 @@
pInfo->dataFormat[1]==0x66 &&
pInfo->dataFormat[2]==0x75 &&
pInfo->dataFormat[3]==0x20 &&
- pInfo->formatVersion[0]==1 )) {
+ pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
+ pInfo->formatVersion[1]==0 &&
+ pInfo->formatVersion[2]==0 &&
+ pInfo->formatVersion[3]==0 )) {
udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
"(format version %02x %02x %02x %02x) is not recognized\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
@@ -830,26 +920,6 @@
sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
- // String Lengths Section
- sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
- sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
- ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
- // Any Case Trie
- sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
- sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
- utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
- // Lower Case Trie
- sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
- sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
- utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
- // Script Sets. The data is an array of int32_t
- sectionStart = ds->readUInt32(spoofDH->fScriptSets);
- sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
- ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
// And, last, swap the header itself.
// int32_t fMagic // swap this
// uint8_t fFormatVersion[4] // Do not swap this, just copy
diff --git a/icu4c/source/i18n/uspoof_impl.h b/icu4c/source/i18n/uspoof_impl.h
index f8c3fe2..aa95dbc 100644
--- a/icu4c/source/i18n/uspoof_impl.h
+++ b/icu4c/source/i18n/uspoof_impl.h
@@ -15,6 +15,7 @@
#ifndef USPOOFIM_H
#define USPOOFIM_H
+#include "uassert.h"
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/uscript.h"
@@ -39,11 +40,13 @@
// Magic number for sanity checking spoof data.
#define USPOOF_MAGIC 0x3845fdef
-class IdentifierInfo;
+// Magic number for sanity checking spoof checkers.
+#define USPOOF_CHECK_MAGIC 0x2734ecde
+
class ScriptSet;
class SpoofData;
struct SpoofDataHeader;
-struct SpoofStringLengthsElement;
+class ConfusableDataUtils;
/**
* Class SpoofImpl corresponds directly to the plain C API opaque type
@@ -51,25 +54,20 @@
*/
class SpoofImpl : public UObject {
public:
- SpoofImpl(SpoofData *data, UErrorCode &status);
- SpoofImpl();
- virtual ~SpoofImpl();
+ SpoofImpl(SpoofData *data, UErrorCode& status);
+ SpoofImpl(UErrorCode& status);
+ SpoofImpl();
+ void construct(UErrorCode& status);
+ virtual ~SpoofImpl();
/** Copy constructor, used by the user level uspoof_clone() function.
*/
SpoofImpl(const SpoofImpl &src, UErrorCode &status);
+ USpoofChecker *asUSpoofChecker();
static SpoofImpl *validateThis(USpoofChecker *sc, UErrorCode &status);
static const SpoofImpl *validateThis(const USpoofChecker *sc, UErrorCode &status);
- /** Get the confusable skeleton transform for a single code point.
- * The result is a string with a length between 1 and 18.
- * @param tableMask bit flag specifying which confusable table to use.
- * One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
- * @return The length in UTF-16 code units of the substition string.
- */
- int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &destBuf) const;
-
/** Set and Get AllowedLocales, implementations of the corresponding API */
void setAllowedLocales(const char *localesList, UErrorCode &status);
const char * getAllowedLocales(UErrorCode &status);
@@ -78,26 +76,19 @@
// the specified locale. Part of the implementation of setAllowedLocales.
void addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status);
+ // Functions implementing the features of UTS 39 section 5.
+ static void getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status);
+ void getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const;
+ void getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const;
+ void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const;
+ URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const;
/** parse a hex number. Untility used by the builders. */
static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);
- // Implementation for Whole Script tests.
- // Return the test bit flag to be ORed into the eventual user return value
- // if a Spoof opportunity is detected.
- void wholeScriptCheck(
- const UnicodeString &text, ScriptSet *result, UErrorCode &status) const;
-
static UClassID U_EXPORT2 getStaticClassID(void);
virtual UClassID getDynamicClassID(void) const;
- // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
- // Maintain a one-element cache, which is sufficient to avoid repeatedly
- // creating new ones unless we get multi-thread concurrency in spoof
- // check operations, which should be statistically uncommon.
- IdentifierInfo *getIdentifierInfo(UErrorCode &status) const;
- void releaseIdentifierInfo(IdentifierInfo *idInfo) const;
-
//
// Data Members
//
@@ -108,14 +99,36 @@
SpoofData *fSpoofData;
const UnicodeSet *fAllowedCharsSet; // The UnicodeSet of allowed characters.
- // for this Spoof Checker. Defaults to all chars.
+ // for this Spoof Checker. Defaults to all chars.
const char *fAllowedLocales; // The list of allowed locales.
URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier.
-
- IdentifierInfo *fCachedIdentifierInfo; // Do not use directly. See getIdentifierInfo().:w
};
+/**
+ * Class CheckResult corresponds directly to the plain C API opaque type
+ * USpoofCheckResult. One can be cast to the other.
+ */
+class CheckResult : public UObject {
+public:
+ CheckResult();
+ virtual ~CheckResult();
+
+ USpoofCheckResult *asUSpoofCheckResult();
+ static CheckResult *validateThis(USpoofCheckResult *ptr, UErrorCode &status);
+ static const CheckResult *validateThis(const USpoofCheckResult *ptr, UErrorCode &status);
+
+ void clear();
+
+ // Used to convert this CheckResult to the older int32_t return value API
+ int32_t toCombinedBitmask(int32_t expectedChecks);
+
+ // Data Members (all stack-allocated)
+ int32_t fMagic; // Internal sanity check.
+ int32_t fChecks; // Bit vector of checks that were failed.
+ UnicodeSet fNumerics; // Set of numerics found in the string.
+ URestrictionLevel fRestrictionLevel; // The restriction level of the string.
+};
//
@@ -127,14 +140,7 @@
//
// The keys are stored as a sorted array of 32 bit ints.
// bits 0-23 a code point value
-// bits 24-31 flags
-// 24: 1 if entry applies to SL table
-// 25: 1 if entry applies to SA table
-// 26: 1 if entry applies to ML table
-// 27: 1 if entry applies to MA table
-// 28: 1 if there are multiple entries for this code point.
-// 29-30: length of value string, in UChars.
-// values are (1, 2, 3, other)
+// bits 24-31 length of value string, in UChars (between 1 and 256 UChars).
// The key table is sorted in ascending code point order. (not on the
// 32 bit int value, the flag bits do not participate in the sorting.)
//
@@ -154,33 +160,25 @@
//
// There is no nul character or other mark between adjacent strings.
//
-// String Lengths table
-// The length of strings from 1 to 3 is flagged in the key table.
-// For strings of length 4 or longer, the string length table provides a
-// mapping between an index into the string table and the corresponding length.
-// Strings of these lengths are rare, so lookup time is not an issue.
-// Each entry consists of
-// uint16_t index of the _last_ string with this length
-// uint16_t the length
-//
-// Flag bits in the Key entries
-#define USPOOF_SL_TABLE_FLAG (1<<24)
-#define USPOOF_SA_TABLE_FLAG (1<<25)
-#define USPOOF_ML_TABLE_FLAG (1<<26)
-#define USPOOF_MA_TABLE_FLAG (1<<27)
-#define USPOOF_KEY_MULTIPLE_VALUES (1<<28)
-#define USPOOF_KEY_LENGTH_SHIFT 29
-#define USPOOF_KEY_LENGTH_FIELD(x) (((x)>>29) & 3)
-
-
-struct SpoofStringLengthsElement {
- uint16_t fLastString; // index in string table of last string with this length
- uint16_t fStrLength; // Length of strings
+// Internal functions for manipulating confusable data table keys
+#define USPOOF_CONFUSABLE_DATA_FORMAT_VERSION 2 // version for ICU 58
+class ConfusableDataUtils {
+public:
+ inline static UChar32 keyToCodePoint(int32_t key) { // bits 0-23 of a key
+ return key & 0x00ffffff;
+ }
+ inline static int32_t keyToLength(int32_t key) { // bits 24-31 hold (length - 1)
+ return ((key & 0xff000000) >> 24) + 1;
+ }
+ inline static int32_t codePointAndLengthToKey(UChar32 codePoint, int32_t length) {
+ U_ASSERT((codePoint & 0x00ffffff) == codePoint);
+ U_ASSERT(length >= 1 && length <= 256);
+ return codePoint | (int32_t)((uint32_t)(length - 1) << 24); // shift as unsigned: bit 31 may be set
+ }
+};
-
//-------------------------------------------------------------------------------------
//
// SpoofData
@@ -197,7 +195,9 @@
//---------------------------------------------------------------------------------------
class SpoofData: public UMemory {
public:
- static SpoofData *getDefault(UErrorCode &status); // Load standard ICU spoof data.
+ static SpoofData* getDefault(UErrorCode &status); // Get standard ICU spoof data.
+ static void releaseDefault(); // Cleanup reference to default spoof data.
+
SpoofData(UErrorCode &status); // Create new spoof data wrapper.
// Only used when building new data from rules.
@@ -212,7 +212,8 @@
// Check raw Spoof Data Version compatibility.
// Return TRUE it looks good.
- static UBool validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status);
+ UBool validateDataVersion(UErrorCode &status) const;
+
~SpoofData(); // Destructor not normally used.
// Use removeReference() instead.
// Reference Counting functions.
@@ -222,6 +223,35 @@
SpoofData *addReference();
void removeReference();
+ // Reset all fields to an initial state.
+ // Called from the top of all constructors.
+ void reset();
+
+ // Copy this instance's raw data buffer to the specified address.
+ int32_t serialize(void *buf, int32_t capacity, UErrorCode &status) const;
+
+ // Get the total number of bytes of data backed by this SpoofData.
+ // Not to be confused with length, which returns the number of confusable entries.
+ int32_t size() const;
+
+ // Get the confusable skeleton transform for a single code point.
+ // The result is a string with a length between 1 and 18 as of Unicode 9.
+ // This is the main public endpoint for this class.
+ // @return The length in UTF-16 code units of the substitution string.
+ int32_t confusableLookup(UChar32 inChar, UnicodeString &dest) const;
+
+ // Get the number of confusable entries in this SpoofData.
+ int32_t length() const;
+
+ // Get the code point (key) at the specified index.
+ UChar32 codePointAt(int32_t index) const;
+
+ // Get the confusable skeleton (value) at the specified index.
+ // Append it to the specified UnicodeString&.
+ // @return The length in UTF-16 code units of the skeleton string.
+ int32_t appendValueTo(int32_t index, UnicodeString& dest) const;
+
+ private:
// Reserve space in the raw data. For use by builder when putting together a
// new set of data. Init the new storage to zero, to prevent inconsistent
// results if it is not all otherwise set by the requester.
@@ -232,10 +262,6 @@
// initialize the pointers from this object to the raw data.
void initPtrs(UErrorCode &status);
- // Reset all fields to an initial state.
- // Called from the top of all constructors.
- void reset();
-
SpoofDataHeader *fRawData; // Ptr to the raw memory-mapped data
UBool fDataOwned; // True if the raw data is owned, and needs
// to be deleted when refcount goes to zero.
@@ -249,15 +275,10 @@
// Confusable data
int32_t *fCFUKeys;
uint16_t *fCFUValues;
- SpoofStringLengthsElement *fCFUStringLengths;
UChar *fCFUStrings;
- // Whole Script Confusable Data
- UTrie2 *fAnyCaseTrie;
- UTrie2 *fLowerCaseTrie;
- ScriptSet *fScriptSets;
- };
-
+ friend class ConfusabledataBuilder;
+};
//---------------------------------------------------------------------------------------
//
@@ -286,49 +307,13 @@
int32_t fCFUStringTable; // byte offset of String table
int32_t fCFUStringTableLen; // length of string table (in 16 bit UChars)
- int32_t fCFUStringLengths; // byte offset to String Lengths table
- int32_t fCFUStringLengthsSize; // number of entries in lengths table. (2 x 16 bits each)
-
-
- // The following sections are for data from confusablesWholeScript.txt
-
- int32_t fAnyCaseTrie; // byte offset to the serialized Any Case Trie
- int32_t fAnyCaseTrieLength; // Length (bytes) of the serialized Any Case Trie
-
- int32_t fLowerCaseTrie; // byte offset to the serialized Lower Case Trie
- int32_t fLowerCaseTrieLength; // Length (bytes) of the serialized Lower Case Trie
-
- int32_t fScriptSets; // byte offset to array of ScriptSets
- int32_t fScriptSetsLength; // Number of ScriptSets (24 bytes each)
-
-
// The following sections are for data from xidmodifications.txt
-
-
+
int32_t unused[15]; // Padding, Room for Expansion
-
- };
+};
-
-//
-// Structure for the Whole Script Confusable Data
-// See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
-// Whole Script confusable data
-//
-// The data provides mappings from code points to a set of scripts
-// that contain characters that might be confused with the code point.
-// There are two mappings, one for lower case only, and one for characters
-// of any case.
-//
-// The actual data consists of a utrie2 to map from a code point to an offset,
-// and an array of UScriptSets (essentially bit maps) that is indexed
-// by the offsets obtained from the Trie.
-//
-//
-
-
U_NAMESPACE_END
#endif /* __cplusplus */
diff --git a/icu4c/source/test/cintltst/spooftest.c b/icu4c/source/test/cintltst/spooftest.c
index 35a64df..6a82372 100644
--- a/icu4c/source/test/cintltst/spooftest.c
+++ b/icu4c/source/test/cintltst/spooftest.c
@@ -222,7 +222,7 @@
checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
+ TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, checkResults);
uspoof_close(sc2);
free(buf);
@@ -299,7 +299,7 @@
checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
+ TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, checkResults);
uspoof_close(clone2);
TEST_TEARDOWN;
@@ -318,7 +318,7 @@
result = uspoof_check(sc, scMixed, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, result);
+ TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, result);
TEST_TEARDOWN
@@ -428,7 +428,7 @@
checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
+ TEST_ASSERT_EQ(0, checkResults);
TEST_TEARDOWN;
/*
@@ -436,7 +436,7 @@
*/
TEST_SETUP
char utf8buf[200];
- int32_t checkResults;
+ int32_t checkResults, checkResults2;
int32_t position;
u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
@@ -457,13 +457,62 @@
TEST_ASSERT_SUCCESS(status);
position = 666;
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
+ checkResults2 = uspoof_check(sc, scMixed, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
+ TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT , checkResults);
TEST_ASSERT_EQ(0, position);
+ TEST_ASSERT_EQ(checkResults , checkResults2);
TEST_TEARDOWN;
/*
+ * uspoof_check2 variants
+ */
+ TEST_SETUP
+ int32_t result1, result2;
+ char utf8buf[200];
+ uspoof_setChecks(sc, USPOOF_ALL_CHECKS | USPOOF_AUX_INFO, &status);
+ USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
+ TEST_ASSERT_SUCCESS(status);
+
+ const UChar* tests[] = { goodLatin, scMixed, scLatin,
+ goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana };
+
+ for (int32_t i=0; i<(int32_t)(sizeof(tests)/sizeof(tests[0])); i++) {
+ const UChar* str = tests[i];
+
+ // Basic test
+ result1 = uspoof_check(sc, str, -1, NULL, &status);
+ result2 = uspoof_check2(sc, str, -1, NULL, &status);
+ TEST_ASSERT_SUCCESS(status);
+ TEST_ASSERT_EQ(result1, result2);
+
+ // With check result parameter
+ result1 = uspoof_check(sc, str, -1, NULL, &status);
+ result2 = uspoof_check2(sc, str, -1, checkResult, &status);
+ TEST_ASSERT_SUCCESS(status);
+ TEST_ASSERT_EQ(result1, result2);
+
+ // Checks from checkResult should be same as those from bitmask
+ TEST_ASSERT_EQ(result1 & USPOOF_ALL_CHECKS, uspoof_getCheckResultChecks(checkResult, &status));
+
+ // Restriction level from checkResult should be same as that from bitmask
+ URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult, &status);
+ TEST_ASSERT_EQ(result1 & restrictionLevel, restrictionLevel);
+
+ // UTF8 endpoint, exercised with the current test string (not a fixed one)
+ u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, str, -1, &status);
+ TEST_ASSERT_SUCCESS(status);
+ result1 = uspoof_checkUTF8(sc, utf8buf, -1, NULL, &status);
+ result2 = uspoof_check2UTF8(sc, utf8buf, -1, NULL, &status);
+ TEST_ASSERT_SUCCESS(status);
+ TEST_ASSERT_EQ(result1, result2);
+ }
+
+ uspoof_closeCheckResult(checkResult);
+ TEST_TEARDOWN;
+
+ /*
* uspoof_areConfusable()
*/
TEST_SETUP
diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt
index b9b0698..2ceab6e 100644
--- a/icu4c/source/test/depstest/dependencies.txt
+++ b/icu4c/source/test/depstest/dependencies.txt
@@ -803,8 +803,7 @@
uclean_i18n
group: spoof_detection
- uspoof.o uspoof_build.o uspoof_conf.o uspoof_impl.o uspoof_wsconf.o
- identifier_info.o scriptset.o
+ uspoof.o uspoof_build.o uspoof_conf.o uspoof_impl.o scriptset.o
deps
uniset_props regex unorm uscript
diff --git a/icu4c/source/test/intltest/itspoof.cpp b/icu4c/source/test/intltest/itspoof.cpp
index 9b267e7..835ef22 100644
--- a/icu4c/source/test/intltest/itspoof.cpp
+++ b/icu4c/source/test/intltest/itspoof.cpp
@@ -23,7 +23,6 @@
#include "unicode/uspoof.h"
#include "cstring.h"
-#include "identifier_info.h"
#include "scriptset.h"
#include "uhash.h"
@@ -58,11 +57,15 @@
USpoofChecker *sc; \
sc = uspoof_open(&status); \
TEST_ASSERT_SUCCESS(status); \
+ USpoofCheckResult *checkResult; \
+ checkResult = uspoof_openCheckResult(&status); \
+ TEST_ASSERT_SUCCESS(status); \
if (U_SUCCESS(status)){
#define TEST_TEARDOWN \
} \
TEST_ASSERT_SUCCESS(status); \
+ uspoof_closeCheckResult(checkResult); \
uspoof_close(sc); \
}
@@ -81,7 +84,6 @@
TESTCASE_AUTO(testInvisible);
TESTCASE_AUTO(testConfData);
TESTCASE_AUTO(testBug8654);
- TESTCASE_AUTO(testIdentifierInfo);
TESTCASE_AUTO(testScriptSet);
TESTCASE_AUTO(testRestrictionLevel);
TESTCASE_AUTO(testMixedNumbers);
@@ -105,6 +107,7 @@
UnicodeString s1("cxs");
UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs"
int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
+ TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
TEST_TEARDOWN;
@@ -223,8 +226,9 @@
"A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
"A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ");
- TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
+ int32_t result = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
TEST_ASSERT_SUCCESS(status);
+ TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, result);
TEST_TEARDOWN;
}
@@ -398,146 +402,6 @@
}
}
-// testIdentifierInfo. Note that IdentifierInfo is not public ICU API at this time
-void IntlTestSpoof::testIdentifierInfo() {
- UErrorCode status = U_ZERO_ERROR;
- ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
- ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status);
- TEST_ASSERT(bitset12.contains(bitset2));
- TEST_ASSERT(bitset12.contains(bitset12));
- TEST_ASSERT(!bitset2.contains(bitset12));
-
- ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status);
- ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
- UElement arabEl; arabEl.pointer = &arabSet;
- UElement latinEl; latinEl.pointer = &latinSet;
- TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
- TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
-
- UnicodeString scriptString;
- bitset12.displayScripts(scriptString);
- TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
-
- status = U_ZERO_ERROR;
- UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status);
- uhash_puti(alternates, &bitset12, 1, &status);
- uhash_puti(alternates, &bitset2, 1, &status);
- UnicodeString alternatesString;
- IdentifierInfo::displayAlternates(alternatesString, alternates, status);
- TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString);
- TEST_ASSERT_SUCCESS(status);
-
- status = U_ZERO_ERROR;
- ScriptSet tScriptSet;
- tScriptSet.parseScripts(scriptString, status);
- TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT(bitset12 == tScriptSet);
- UnicodeString ss;
- ss.remove();
- uhash_close(alternates);
-
- struct Test {
- const char *fTestString;
- URestrictionLevel fRestrictionLevel;
- const char *fNumerics;
- const char *fScripts;
- const char *fAlternates;
- const char *fCommonAlternates;
- } tests[] = {
- {"\\u0061\\u2665", USPOOF_UNRESTRICTIVE, "[]", "Latn", "", ""},
- {"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
- {"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"},
- {"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
- {"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
- {"\\u0061\\u0031\\u0661", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"},
- {"\\u0061\\u0031\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""},
- {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_UNRESTRICTIVE,
- "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
- {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_UNRESTRICTIVE,
- "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}
- };
-
- int testNum;
- for (testNum = 0; testNum < UPRV_LENGTHOF(tests); testNum++) {
- char testNumStr[40];
- sprintf(testNumStr, "testNum = %d", testNum);
- Test &test = tests[testNum];
- status = U_ZERO_ERROR;
- UnicodeString testString(test.fTestString); // Note: may do charset conversion.
- testString = testString.unescape();
- IdentifierInfo idInfo(status);
- TEST_ASSERT_SUCCESS(status);
- UnicodeSet allowedChars;
- // Allowed Identifier Characters. In addition to the Recommended Set,
- // allow u303c, which has an interesting script extension of Hani Hira Kana.
- allowedChars.addAll(*uspoof_getRecommendedUnicodeSet(&status)).add(0x303C);
- idInfo.setIdentifierProfile(allowedChars);
- idInfo.setIdentifier(testString, status);
- TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr);
-
- URestrictionLevel restrictionLevel = test.fRestrictionLevel;
- TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr);
-
- status = U_ZERO_ERROR;
- UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status);
- TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr);
-
- ScriptSet scripts;
- scripts.parseScripts(UnicodeString(test.fScripts), status);
- TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr);
-
- UnicodeString alternatesStr;
- IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status);
- TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr);
-
- ScriptSet commonAlternates;
- commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status);
- TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr);
- }
-
- // Test of getScriptCount()
- // Script and or Script Extension for chars used in the tests
- // \\u3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
- // \\uA838 ; Deva Gujr Guru Kthi Takr # Sc NORTH INDIC RUPEE MARK
- // \\u0951 ; Deva Latn # Mn DEVANAGARI STRESS SIGN UDATTA
- //
- // \\u0370 ; Greek # L GREEK CAPITAL LETTER HETA
- // \\u0481 ; Cyrillic # L& CYRILLIC SMALL LETTER KOPPA
- // \\u0904 ; Devanagari # Lo DEVANAGARI LETTER SHORT A
- // \\u3041 ; Hiragana # Lo HIRAGANA LETTER SMALL A
- // 1234 ; Common # ascii digits
- // \\u0300 ; Inherited # Mn COMBINING GRAVE ACCENT
-
- struct ScriptTest {
- const char *fTestString;
- int32_t fScriptCount;
- } scriptTests[] = {
- {"Hello", 1},
- {"Hello\\u0370", 2},
- {"1234", 0},
- {"Hello1234\\u0300", 1}, // Common and Inherited are ignored.
- {"\\u0030", 0},
- {"abc\\u0951", 1},
- {"abc\\u3013", 2},
- {"\\uA838\\u0951", 1}, // Triggers commonAmongAlternates path.
- {"\\u3013\\uA838", 2}
- };
-
- status = U_ZERO_ERROR;
- IdentifierInfo identifierInfo(status);
- for (testNum=0; testNum<UPRV_LENGTHOF(scriptTests); testNum++) {
- ScriptTest &test = scriptTests[testNum];
- char msgBuf[100];
- sprintf(msgBuf, "testNum = %d ", testNum);
- UnicodeString testString = UnicodeString(test.fTestString).unescape();
-
- status = U_ZERO_ERROR;
- identifierInfo.setIdentifier(testString, status);
- int32_t scriptCount = identifierInfo.getScriptCount();
- TEST_ASSERT_MSG(test.fScriptCount == scriptCount, msgBuf);
- }
-}
void IntlTestSpoof::testScriptSet() {
ScriptSet s1;
@@ -601,6 +465,14 @@
TEST_ASSERT(s2.countMembers() == 1);
s1.resetAll();
+ TEST_ASSERT(s1.isEmpty());
+ s1.set(USCRIPT_LATIN, status);
+ TEST_ASSERT(!s1.isEmpty());
+ s1.setAll();
+ TEST_ASSERT(!s1.isEmpty());
+ TEST_ASSERT_SUCCESS(status);
+
+ s1.resetAll();
s1.set(USCRIPT_AFAKA, status);
s1.set(USCRIPT_VAI, status);
s1.set(USCRIPT_INHERITED, status);
@@ -616,6 +488,39 @@
}
}
TEST_ASSERT_SUCCESS(status);
+
+ // Script extensions. Depends on data.
+ s1.resetAll();
+ s1.setScriptExtensions(0x67, status);
+ TEST_ASSERT(s1.countMembers() == 1);
+ TEST_ASSERT(s1.test(USCRIPT_LATIN, status));
+ TEST_ASSERT_SUCCESS(status);
+
+ s1.resetAll();
+ s1.setScriptExtensions(0x303C, status);
+ TEST_ASSERT(s1.countMembers() == 3);
+ TEST_ASSERT(s1.test(USCRIPT_HAN, status));
+ TEST_ASSERT(s1.test(USCRIPT_HIRAGANA, status));
+ TEST_ASSERT(s1.test(USCRIPT_KATAKANA, status));
+ TEST_ASSERT_SUCCESS(status);
+
+ // Additional tests
+ ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
+ ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status);
+ TEST_ASSERT(bitset12.contains(bitset2));
+ TEST_ASSERT(bitset12.contains(bitset12));
+ TEST_ASSERT(!bitset2.contains(bitset12));
+
+ ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status);
+ ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
+ UElement arabEl; arabEl.pointer = &arabSet;
+ UElement latinEl; latinEl.pointer = &latinSet;
+ TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
+ TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
+
+ UnicodeString scriptString;
+ bitset12.displayScripts(scriptString);
+ TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
}
@@ -629,35 +534,40 @@
{"\\u03B3", USPOOF_SINGLE_SCRIPT_RESTRICTIVE},
{"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE},
{"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE},
- {"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE}
+ {"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE},
+ {"\\u0061\\u2665", USPOOF_UNRESTRICTIVE},
+ {"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
+ {"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
+ {"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE},
+ {"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
+ {"\\u0061\\u0031\\u0661", USPOOF_MODERATELY_RESTRICTIVE},
+ {"\\u0061\\u0031\\u0661\\u06F1", USPOOF_MODERATELY_RESTRICTIVE},
+ {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE},
+ {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE}
};
char msgBuffer[100];
-
URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_SINGLE_SCRIPT_RESTRICTIVE,
- USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE,
- USPOOF_UNRESTRICTIVE};
-
+ USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE,
+ USPOOF_UNRESTRICTIVE};
+
UErrorCode status = U_ZERO_ERROR;
- IdentifierInfo idInfo(status);
- TEST_ASSERT_SUCCESS(status);
- idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
- TEST_ASSERT_SUCCESS(status);
+ UnicodeSet allowedChars;
+ // Allowed Identifier Characters. In addition to the Recommended Set,
+ // allow U+303C, whose script extensions are Hani, Hira, and Kana.
+ allowedChars.addAll(*uspoof_getRecommendedUnicodeSet(&status)).add(0x303C);
+
for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) {
status = U_ZERO_ERROR;
const Test &test = tests[testNum];
UnicodeString testString = UnicodeString(test.fId).unescape();
URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel;
- idInfo.setIdentifier(testString, status);
- sprintf(msgBuffer, "testNum = %d ", testNum);
- TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer);
for (int levelIndex=0; levelIndex<UPRV_LENGTHOF(restrictionLevels); levelIndex++) {
status = U_ZERO_ERROR;
URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex];
USpoofChecker *sc = uspoof_open(&status);
- uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
- uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
+ uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status);
uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
+ uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
int32_t actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status);
// we want to fail if the text is (say) MODERATE and the testLevel is ASCII
@@ -665,9 +575,6 @@
if (expectedLevel > levelSetInSpoofChecker) {
expectedValue |= USPOOF_RESTRICTION_LEVEL;
}
- if (!uspoof_getRecommendedUnicodeSet(&status)->containsAll(testString)) {
- expectedValue |= USPOOF_CHAR_LIMIT;
- }
sprintf(msgBuffer, "testNum = %d, levelIndex = %d, expected = %#x, actual = %#x",
testNum, levelIndex, expectedValue, actualValue);
TEST_ASSERT_MSG(expectedValue == actualValue, msgBuffer);
@@ -675,9 +582,9 @@
// Run the same check again, with the Spoof Checker configured to return
// the actual restriction level.
- uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status);
- uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
+ uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status);
uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
+ uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status);
int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
TEST_ASSERT_SUCCESS(status);
if (U_SUCCESS(status)) {
@@ -687,8 +594,8 @@
uspoof_close(sc);
}
}
-}
+}
void IntlTestSpoof::testMixedNumbers() {
struct Test {
@@ -698,10 +605,18 @@
{"1", "[0]"},
{"\\u0967", "[\\u0966]"},
{"1\\u0967", "[0\\u0966]"},
- {"\\u0661\\u06F1", "[\\u0660\\u06F0]"}
+ {"\\u0661\\u06F1", "[\\u0660\\u06F0]"},
+ {"\\u0061\\u2665", "[]"},
+ {"\\u0061\\u303C", "[]"},
+ {"\\u0061\\u30FC\\u303C", "[]"},
+ {"\\u0061\\u30FC\\u303C\\u30A2", "[]"},
+ {"\\u30A2\\u0061\\u30FC\\u303C", "[]"},
+ {"\\u0061\\u0031\\u0661", "[\\u0030\\u0660]"},
+ {"\\u0061\\u0031\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0]"},
+ {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"},
+ {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"}
};
UErrorCode status = U_ZERO_ERROR;
- IdentifierInfo idInfo(status);
for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) {
char msgBuf[100];
sprintf(msgBuf, "testNum = %d ", testNum);
@@ -710,17 +625,16 @@
status = U_ZERO_ERROR;
UnicodeString testString = UnicodeString(test.fTestString).unescape();
UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status);
- idInfo.setIdentifier(testString, status);
- TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf);
status = U_ZERO_ERROR;
- USpoofChecker *sc = uspoof_open(&status);
- uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
- int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
- UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0);
- TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
- uspoof_close(sc);
+ TEST_SETUP
+ uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
+ uspoof_check2UnicodeString(sc, testString, checkResult, &status);
+ UBool mixedNumberFailure = ((uspoof_getCheckResultChecks(checkResult, &status) & USPOOF_MIXED_NUMBERS) != 0);
+ TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
+ const UnicodeSet* actualSet = UnicodeSet::fromUSet(uspoof_getCheckResultNumerics(checkResult, &status));
+ TEST_ASSERT_MSG(expectedSet == *actualSet, msgBuf);
+ TEST_TEARDOWN
}
}
diff --git a/icu4c/source/test/intltest/itspoof.h b/icu4c/source/test/intltest/itspoof.h
index c6b64eb..fc54e89 100644
--- a/icu4c/source/test/intltest/itspoof.h
+++ b/icu4c/source/test/intltest/itspoof.h
@@ -38,8 +38,6 @@
void testBug8654();
- void testIdentifierInfo();
-
void testScriptSet();
void testRestrictionLevel();
diff --git a/icu4c/source/tools/gencfu/gencfu.cpp b/icu4c/source/tools/gencfu/gencfu.cpp
index 7178c2f..f1175a7 100644
--- a/icu4c/source/tools/gencfu/gencfu.cpp
+++ b/icu4c/source/tools/gencfu/gencfu.cpp
@@ -16,17 +16,20 @@
// derived from the Unicode Consortium data described in
// Unicode UAX 39.
//
-// Usage: gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt -o output-file.cfu
+// Usage: gencfu [options] -r confusables-file.txt -o output-file.cfu
//
// options: -v verbose
// -? or -h help
//
// The input rule filew is are plain text files containing confusable character
// definitions in the input format defined by Unicode UAX39 for the files
-// confusables.txt and confusablesWholeScript.txt. This source (.txt) format
+// confusables.txt. This source (.txt) format
// is also accepted direaccepted by ICU spoof detedtors. The
// files must be encoded in utf-8 format, with or without a BOM.
//
+// This tool formerly also compiled confusablesWholeScript.txt into the CFU file,
+// until the Unicode Consortium deprecated that file.
+//
//--------------------------------------------------------------------
#include "unicode/utypes.h"
@@ -53,7 +56,7 @@
UOPTION_HELP_QUESTION_MARK, /* 1 */
UOPTION_VERBOSE, /* 2 */
{ "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
- { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */
+ { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */ // deprecated
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 5 */
UOPTION_ICUDATADIR, /* 6 */
UOPTION_DESTDIR, /* 7 */
@@ -62,7 +65,7 @@
};
void usageAndDie(int retCode) {
- printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName);
+ printf("Usage: %s [-v] [-options] -r confusablesRules.txt -o output-file\n", progName);
printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
"options:\n"
"\t-h or -? or --help this usage text\n"
@@ -133,7 +136,6 @@
int main(int argc, char **argv) {
UErrorCode status = U_ZERO_ERROR;
const char *confFileName;
- const char *confWSFileName;
const char *outFileName;
const char *outDir = NULL;
const char *copyright = NULL;
@@ -156,12 +158,11 @@
usageAndDie(0);
}
- if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) {
- fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n");
+ if (!(options[3].doesOccur && options[5].doesOccur)) {
+ fprintf(stderr, "confusables file and output file must all be specified.\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
confFileName = options[3].value;
- confWSFileName = options[4].value;
outFileName = options[5].value;
if (options[6].doesOccur) {
@@ -220,13 +221,6 @@
exit(-1);
}
- int32_t wsConfusablesLen = 0;
- const char *wsConfsables = readFile(confWSFileName, &wsConfusablesLen);
- if (wsConfsables == NULL) {
- printf("gencfu: error reading file \"%s\"\n", confFileName);
- exit(-1);
- }
-
//
// Create the Spoof Detector from the source confusables files.
// This will compile the data.
@@ -236,13 +230,11 @@
parseError.offset = 0;
int32_t errType;
USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
- wsConfsables, wsConfusablesLen,
+ NULL, 0,
&errType, &parseError, &status);
if (U_FAILURE(status)) {
- const char *errFile =
- (errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? confWSFileName : confFileName;
fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n",
- u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset);
+ u_errorName(status), confFileName, (int)parseError.line, (int)parseError.offset);
exit(status);
};
@@ -297,7 +289,6 @@
uspoof_close(sc);
delete [] outData;
delete [] confusables;
- delete [] wsConfsables;
u_cleanup();
if (!quiet) {
printf("gencfu: tool completed successfully.\n");