| /* |
| *************************************************************************** |
| * Copyright (C) 2008-2011, International Business Machines Corporation |
| * and others. All Rights Reserved. |
| *************************************************************************** |
| * |
| * Unicode Spoof Detection |
| */ |
| package com.ibm.icu.text; |
| |
| import java.io.BufferedInputStream; |
| import java.io.ByteArrayInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.io.DataInputStream; |
| import java.io.DataOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.LineNumberReader; |
| import java.io.Reader; |
| import java.text.ParseException; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.Hashtable; |
| import java.util.LinkedHashSet; |
| import java.util.Set; |
| import java.util.Vector; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import com.ibm.icu.impl.Trie2; |
| import com.ibm.icu.impl.Trie2Writable; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UCharacterCategory; |
| import com.ibm.icu.lang.UProperty; |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.util.ULocale; |
| |
| /** |
| * |
| * <b>Unicode Security and Spoofing Detection.</b> |
| * |
| * <p>This class is intended to check strings, typically |
| * identifiers of some type, such as URLs, for the presence of |
| * characters that are likely to be visually confusing - |
| * for cases where the displayed form of an identifier may |
| * not be what it appears to be. |
| * |
| * <p>Unicode Technical Report #36, |
| * <a href="http://unicode.org/reports/tr36">http://unicode.org/reports/tr36</a> and |
| * Unicode Technical Standard #39, |
| * <a href="http://unicode.org/reports/tr39">http://unicode.org/reports/tr39</a> |
| * "Unicode security considerations", give more background on |
| * security and spoofing issues with Unicode identifiers. |
| * The tests and checks provided by this module implement the recommendations |
| * from these Unicode documents. |
| * |
| * <p>The tests available on identifiers fall into two general categories: |
| * <ul> |
| * <li> Single identifier tests. Check whether an identifier is |
| * potentially confusable with any other string, or is suspicious |
| * for other reasons. </li> |
| * <li> Two identifier tests. Check whether two specific identifiers are confusable. |
| * This does not consider whether either of strings is potentially |
| * confusable with any string other than the exact one specified. </li> |
| * </ul> |
| * |
| * <p>The steps to perform confusability testing are |
| * <ul> |
| * <li> Create a <code>SpoofChecker.Builder</code> </li> |
| * <li> Configure the Builder for the desired set of tests. The tests that will |
| * be performed are specified by a set of SpoofCheck flags. </li> |
| * <li> Build a <code>SpoofChecker</code> from the Builder. </li> |
| * <li> Perform the checks using the pre-configured <code>SpoofChecker</code>. The results indicate |
| * which (if any) of the selected tests have identified possible problems with the identifier. |
| * Results are reported as a set of SpoofCheck flags; this mirrors the form in which |
| * the set of tests to perform was originally specified to the SpoofChecker. </li> |
| * </ul> |
| * |
| * <p>A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number |
| * of identifiers. |
| * |
| * <p>Thread Safety: The methods on SpoofChecker objects are thread safe. |
| * The test functions for checking a single identifier, or for testing |
| * whether two identifiers are potentially confusable, may be called concurrently |
| * from multiple threads using the same SpoofChecker instance. |
| * |
| * |
| * <p>Descriptions of the available checks. |
| * |
| * <p>When testing whether pairs of identifiers are confusable, with <code>areConfusable()</code> |
| * the relevant tests are |
| * |
| * <ul> |
| * <li> <code>SINGLE_SCRIPT_CONFUSABLE</code>: All of the characters from the two identifiers are |
| * from a single script, and the two identifiers are visually confusable.</li> |
| * <li> <code>MIXED_SCRIPT_CONFUSABLE</code>: At least one of the identifiers contains characters |
| * from more than one script, and the two identifiers are visually confusable.</li> |
| * <li> <code>WHOLE_SCRIPT_CONFUSABLE</code>: Each of the two identifiers is of a single script, but |
| * the two identifiers are from different scripts, and they are visually confusable.</li> |
| * </ul> |
| * |
| * <p>The safest approach is to enable all three of these checks as a group. |
| * |
| * <p><code>ANY_CASE</code> is a modifier for the above tests. If the identifiers being checked can |
| * be of mixed case and are used in a case-sensitive manner, this option should be specified. |
| * |
| * <p>If the identifiers being checked are used in a case-insensitive manner, and if they are |
| * displayed to users in lower-case form only, the <code>ANY_CASE</code> option should not be |
| * specified. Confusability issues involving upper case letters will not be reported. |
| * |
| * <p>When performing tests on a single identifier, with the check() family of functions, |
| * the relevant tests are: |
| * |
| * <ul> |
| * <li><code>MIXED_SCRIPT_CONFUSABLE</code>: the identifier contains characters from multiple |
| * scripts, and there exists an identifier of a single script that is visually confusable.</li> |
| * <li><code>WHOLE_SCRIPT_CONFUSABLE</code>: the identifier consists of characters from a single |
| * script, and there exists a visually confusable identifier. |
| * The visually confusable identifier also consists of characters from a single script, |
| * but not the same script as the identifier being checked.</li> |
| * <li><code>ANY_CASE</code>: modifies the mixed script and whole script confusables tests. If |
| * specified, the checks will find confusable characters of any case. |
| * If this flag is not set, the test is performed assuming case folded identifiers.</li> |
| * <li><code>SINGLE_SCRIPT</code>: check that the identifier contains only characters from a |
| * single script. (Characters from the <em>common</em> and <em>inherited</em> scripts are ignored.) |
| * This is not a test for confusable identifiers</li> |
| * <li><code>INVISIBLE</code>: check an identifier for the presence of invisible characters, |
| * such as zero-width spaces, or character sequences that are |
| * likely not to display, such as multiple occurrences of the same |
| * non-spacing mark. This check does not test the input string as a whole |
| * for conformance to any particular syntax for identifiers.</li> |
| * <li><code>CHAR_LIMIT</code>: check that an identifier contains only characters from a specified set |
| * of acceptable characters. See <code>Builder.setAllowedChars()</code> and |
| * <code>Builder.setAllowedLocales()</code>.</li> |
| * </ul> |
| * |
| * <p>Note on Scripts: |
| * <blockquote>Characters from the Unicode Scripts "Common" and "Inherited" are ignored when considering |
| * the script of an identifier. Common characters include digits and symbols that |
| * are normally used with text from many different scripts. </blockquote> |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public class SpoofChecker { |
| |
| /* |
| * Bit flags naming the kinds of checks that a SpoofChecker can perform. These values are used both to |
| * select the set of checks that will be performed, and to report results from the check function. |
| */ |
| |
| /** |
| * Single script confusable test. Only meaningful when testing whether two identifiers are confusable: |
| * they are reported as confusable if both are from the same script and they are visually confusable. |
| * Note: this test is not applicable to a check of a single identifier. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static final int SINGLE_SCRIPT_CONFUSABLE = 1 << 0; |
| |
| /** |
| * Mixed script confusable test. |
| * |
| * For a single identifier, a problem is reported if the identifier contains multiple scripts and is also |
| * confusable with some other identifier in a single script. |
| * |
| * For a pair of identifiers, they are reported as confusable if the two IDs are visually confusable |
| * and at least one of them contains characters from more than one script. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static final int MIXED_SCRIPT_CONFUSABLE = 1 << 1; |
| |
| /** |
| * Whole script confusable test. |
| * |
| * For a single identifier, a problem is reported if the identifier is of a single script and there exists a |
| * confusable identifier in another script. |
| * |
| * For a pair of identifiers, they are reported as confusable if each is of a single script, the |
| * scripts of the two identifiers are different, and the identifiers are visually confusable. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static final int WHOLE_SCRIPT_CONFUSABLE = 1 << 2; |
| |
| /** |
| * Any Case Modifier for confusable identifier tests. |
| * |
| * When specified, all characters, of any case, are considered when looking for confusables. If ANY_CASE is |
| * not specified, identifiers being checked are assumed to have been case folded, and upper case confusable |
| * characters will not be checked. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static final int ANY_CASE = 1 << 3; |
| |
| /** |
| * Check that an identifier contains only characters from a single script (plus chars from the common and |
| * inherited scripts.) Applies to checks of a single identifier only. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static final int SINGLE_SCRIPT = 1 << 4; |
| |
| /** |
| * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character |
| * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark. |
| * This check does not test the input string as a whole for conformance to any particular syntax for |
| * identifiers. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static final int INVISIBLE = 1 << 5; |
| |
| /** |
| * Check that an identifier contains only characters from a specified set of acceptable characters. See |
| * Builder.setAllowedChars() and Builder.setAllowedLocales(). |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static final int CHAR_LIMIT = 1 << 6; |
| |
| /** |
| * Enable all spoof checks. This is the 'or' of the seven check flags defined above. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static final int ALL_CHECKS = 0x7f; |
| |
| // Magic number for sanity checking spoof binary resource data. |
| static final int MAGIC = 0x3845fdef; |
| |
| /** |
| * Private constructor: a SpoofChecker cannot be instantiated directly. It has to be created through |
| * {@link Builder#build()}, which fills in the fields of the new instance. |
| */ |
| private SpoofChecker() { |
| } |
| |
| /** |
| * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired |
| * checking options on the builder, then call the build() function to create a SpoofChecker instance. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static class Builder { |
| int fMagic; // Internal sanity check; set to SpoofChecker.MAGIC or copied from a source checker. |
| int fChecks; // Bit vector of checks to perform (an 'or' of the SpoofChecker check constants). |
| SpoofData fSpoofData; |
| UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters |
| // for this Spoof Checker. Defaults to all code points (0 through 0x10ffff). |
| Set<ULocale> fAllowedLocales; // The locales given to setAllowedLocales(); empty when no locale limit is set. |
| |
| /** |
| * Constructor: Create a default Unicode Spoof Checker Builder. The builder starts with every check in |
| * ALL_CHECKS selected, with all code points (0 through 0x10ffff) in the set of allowed characters, |
| * with an empty set of allowed locales, and with no spoof data assigned yet. Note that additional |
| * checks may be added in the future, resulting in changes to the default checking behavior. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public Builder() { |
| this.fAllowedLocales = new LinkedHashSet<ULocale>(); |
| this.fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); |
| this.fSpoofData = null; |
| this.fChecks = SpoofChecker.ALL_CHECKS; |
| this.fMagic = SpoofChecker.MAGIC; |
| } |
| |
| /** |
| * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker. |
| * The checks, the allowed characters, the allowed locales and the spoof data of the source checker are |
| * all carried over. The character set and the locale set are copied, so later changes to this builder |
| * do not affect <code>src</code>; the spoof data object itself is shared with <code>src</code>. |
| * |
| * @param src |
| * The existing checker. |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public Builder(SpoofChecker src) { |
| fMagic = src.fMagic; |
| fChecks = src.fChecks; |
| // Reuse the data the source checker was built with. Resetting this to null would make build() |
| // fall back to the default data and silently drop data that was supplied through setData(). |
| fSpoofData = src.fSpoofData; |
| fAllowedCharsSet = src.fAllowedCharsSet.cloneAsThawed(); |
| fAllowedLocales = new LinkedHashSet<ULocale>(src.fAllowedLocales); |
| } |
| |
| /** |
| * Create a SpoofChecker with current configuration. |
| * |
| * If no spoof data has been assigned to this builder, the default data is loaded and kept by the builder. |
| * The new checker receives its own frozen copy of the allowed character set and its own copy of the |
| * allowed locales, so it is not affected by later changes to this builder. |
| * |
| * @return the new SpoofChecker, or <code>null</code> if the default data could not be read or if the |
| * spoof data does not pass the data version validation. |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public SpoofChecker build() { |
| if (fSpoofData == null) { // read binary file |
| try { |
| fSpoofData = SpoofData.getDefault(); |
| } catch (java.io.IOException e) { |
| // Existing contract: failure to load the default data is reported as a null result. |
| return null; |
| } |
| } |
| if (!SpoofData.validateDataVersion(fSpoofData.fRawData)) { |
| return null; |
| } |
| SpoofChecker result = new SpoofChecker(); |
| result.fMagic = this.fMagic; |
| result.fChecks = this.fChecks; |
| result.fSpoofData = this.fSpoofData; |
| result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone()); |
| result.fAllowedCharsSet.freeze(); |
| // Copy rather than share the locale set, so that the checker, this builder and any other |
| // checker built from this builder never alias the same mutable Set. |
| result.fAllowedLocales = new LinkedHashSet<ULocale>(this.fAllowedLocales); |
| return result; |
| } |
| |
| /** |
| * Specify the source form of the spoof data for this Spoof Checker. The two inputs correspond to the Unicode |
| * data files confusables.txt and confusablesWholeScript.txt as described in Unicode UAX 39. The syntax of the |
| * source data is as described in UAX 39 for these files, and the content of these files is acceptable input. |
| * |
| * The data is compiled into a new data object that is installed in this builder only after both inputs |
| * have been processed successfully. If an exception is thrown, the spoof data previously held by this |
| * builder is left unchanged. |
| * |
| * @param confusables |
| * the Reader of confusable characters definitions, as found in file confusables.txt from |
| * unicode.org. |
| * @param confusablesWholeScript |
| * the Reader of whole script confusables definitions, as found in the file |
| * confusablesWholeScript.txt from unicode.org. |
| * @return self |
| * @throws ParseException |
| * To report syntax errors in the input. |
| * @throws java.io.IOException |
| * If an I/O error occurs while reading the input or writing the compiled data. |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, |
| java.io.IOException { |
| // Compile into a fresh, empty data object. It is held in a local until the build has |
| // succeeded, so that a failure cannot leave partially built data in this builder. |
| SpoofData data = new SpoofData(); |
| ByteArrayOutputStream bos = new ByteArrayOutputStream(); |
| DataOutputStream os = new DataOutputStream(bos); |
| // Compile the binary data from the source (text) format. |
| ConfusabledataBuilder.buildConfusableData(data, confusables); |
| WSConfusableDataBuilder.buildWSConfusableData(data, os, confusablesWholeScript); |
| fSpoofData = data; |
| return this; |
| } |
| |
| /** |
| * Specify the set of checks that will be performed by the check functions of this Spoof Checker. |
| * |
| * @param checks |
| * The set of checks that this spoof checker will perform. The value is an 'or' of the desired |
| * checks. |
| * @return self |
| * @throws IllegalArgumentException |
| * if <code>checks</code> has any bit set that is not part of ALL_CHECKS. |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public Builder setChecks(int checks) { |
| // Any bit outside of ALL_CHECKS does not name a known check and is rejected. |
| final int unknownBits = checks & ~SpoofChecker.ALL_CHECKS; |
| if (unknownBits != 0) { |
| throw new IllegalArgumentException("Bad Spoof Checks value."); |
| } |
| fChecks = checks & SpoofChecker.ALL_CHECKS; |
| return this; |
| } |
| |
| /** |
| * Limit characters that are acceptable in identifiers being checked to those normally used with the languages |
| * associated with the specified locales. Any previously specified list of locales is replaced by the new |
| * settings. |
| * |
| * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is |
| * determined. Characters from this set of scripts, along with characters from the "common" and "inherited" |
| * Unicode Script categories will be permitted. |
| * |
| * Supplying a null or empty set of locales removes all restrictions; characters from any script will be |
| * allowed, and the CHAR_LIMIT test is disabled. |
| * |
| * The CHAR_LIMIT test is automatically enabled for this SpoofChecker when calling this function with a |
| * non-empty set of locales. |
| * |
| * The Unicode Set of characters that will be allowed is accessible via the getAllowedChars() function. |
| * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters. |
| * |
| * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of |
| * setAllowedLocales() by fetching the resulting set with getAllowedChars(), manipulating it with the Unicode |
| * Set API, then resetting the spoof detectors limits with setAllowedChars() |
| * |
| * @param locales |
| * A Set of ULocales, from which the language and associated script are extracted. If the locales Set |
| * is null or empty, no restrictions will be placed on the allowed characters. |
| * |
| * @return self |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public Builder setAllowedLocales(Set<ULocale> locales) { |
| fAllowedCharsSet.clear(); |
| |
| // If our caller provided no locales (null or an empty set), we disable the |
| // allowed characters checking and permit every code point. |
| if (locales == null || locales.isEmpty()) { |
| fAllowedLocales = new LinkedHashSet<ULocale>(); |
| fAllowedCharsSet.add(0, 0x10ffff); |
| fChecks &= ~CHAR_LIMIT; |
| return this; |
| } |
| |
| for (ULocale locale : locales) { |
| // Add the script chars for this locale to the accumulating set |
| // of allowed chars. |
| addScriptChars(locale, fAllowedCharsSet); |
| } |
| |
| // Add all common and inherited characters to the set of allowed |
| // chars. |
| UnicodeSet tempSet = new UnicodeSet(); |
| tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON); |
| fAllowedCharsSet.addAll(tempSet); |
| tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED); |
| fAllowedCharsSet.addAll(tempSet); |
| |
| // Store the updated spoof checker state. The locales are kept in a private copy. |
| fAllowedLocales = new LinkedHashSet<ULocale>(locales); |
| fChecks |= CHAR_LIMIT; |
| return this; |
| } |
| |
| // Add (union) to the UnicodeSet all of the characters for the scripts |
| // used for the specified locale. Part of the implementation of |
| // setAllowedLocales. |
| // UScript.getCode() is documented to return null when no script code can be |
| // found for the locale; such a locale contributes no characters. |
| private void addScriptChars(ULocale locale, UnicodeSet allowedChars) { |
| int scripts[] = UScript.getCode(locale); |
| if (scripts == null) { |
| return; |
| } |
| UnicodeSet tmpSet = new UnicodeSet(); |
| for (int script : scripts) { |
| tmpSet.applyIntPropertyValue(UProperty.SCRIPT, script); |
| allowedChars.addAll(tmpSet); |
| } |
| } |
| |
| /** |
| * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character |
| * limit is replaced by the new settings. This includes limits on characters that were set with the |
| * setAllowedLocales() function. |
| * |
| * The CHAR_LIMIT test is automatically enabled for this SpoofChecker by this function. |
| * |
| * @param chars |
| * A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by |
| * this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling |
| * this function. Note that this clears the allowedLocales set. |
| * @return self |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public Builder setAllowedChars(UnicodeSet chars) { |
| // Work on a private, modifiable copy so that the caller keeps ownership of its own set. |
| final UnicodeSet ownCopy = chars.cloneAsThawed(); |
| this.fAllowedCharsSet = ownCopy; |
| // An explicitly supplied character set supersedes any locale based limit. |
| this.fAllowedLocales = new LinkedHashSet<ULocale>(); |
| this.fChecks = this.fChecks | CHAR_LIMIT; |
| return this; |
| } |
| |
| // Structure for the Whole Script Confusable Data |
| // See Unicode UAX-39, Unicode Security Mechanisms, for a description of the |
| // Whole Script confusable data |
| // |
| // The data provides mappings from code points to a set of scripts |
| // that contain characters that might be confused with the code point. |
| // There are two mappings, one for lower case only, and one for characters |
| // of any case. |
| // |
| // The actual data consists of a utrie2 to map from a code point to an offset, |
| // and an array of UScriptSets (essentially bit maps) that is indexed |
| // by the offsets obtained from the Trie. |
| // |
| // |
| |
| /* |
| * Internal functions for compiling Whole Script confusable source data into its binary (runtime) form. The |
| * binary data format is described in uspoof_impl.h |
| */ |
| private static class WSConfusableDataBuilder { |
| |
| // Regular expression for parsing a line from the Unicode file |
| // confusablesWholeScript.txt |
| // Example Lines: |
| // 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O |
| // 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN |
| // CAPITAL LETTER I |
| // | | | | |
| // | | | |---- Which table, Any Case or Lower Case (A or L) |
| // | | |----------Target script. We need this. |
| // | |----------------Src script. Should match the script of the source |
| // | code points. Beyond checking that, we don't keep it. |
| // |--------------------------------Source code points or range. |
| // |
| // The expression will match _all_ lines, including erroneous lines. |
| // The result of the parse is returned via the contents of the (match) |
| // groups. |
| static String parseExp = |
| |
| "(?m)" + // Multi-line mode |
| "^([ \\t]*(?:#.*?)?)$" + // A blank or comment line. Matches Group |
| // 1. |
| "|^(?:" + // OR |
| "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" + // Code point |
| // range. Groups |
| // 2 and 3. |
| "\\s*([A-Za-z]+)\\s*;" + // The source script. Group 4. |
| "\\s*([A-Za-z]+)\\s*;" + // The target script. Group 5. |
| "\\s*(?:(A)|(L))" + // The table A or L. Group 6 or 7 |
| "[ \\t]*(?:#.*?)?" + // Trailing commment |
| ")$|" + // OR |
| "^(.*?)$"; // An error line. Group 8. |
| |
| // Any line not matching the preceding |
| // parts of the expression will match |
| // the final alternative (Group 8), and thus be flagged as an error. |
| |
| // Extract a regular expression match group into a char * string. |
| // The group must contain only invariant characters. |
| // Used for script names |
| // |
| |
| // Read all remaining lines from the Reader and append them to the buffer, |
| // terminating every line, including the last one, with a single '\n'. |
| static void readWholeFileToString(Reader reader, StringBuffer buffer) throws java.io.IOException { |
| LineNumberReader lineSource = new LineNumberReader(reader); |
| for (String text = lineSource.readLine(); text != null; text = lineSource.readLine()) { |
| buffer.append(text); |
| buffer.append('\n'); |
| } |
| } |
| |
| // Build the Whole Script Confusable data |
| // |
| // Parses the text of confusablesWholeScript.txt read from confusablesWS, builds the any-case and |
| // lower-case Tries that map a code point to the index of its script set, merges identical |
| // script sets, and then serializes the two Tries followed by the distinct script sets to os. |
| // The number of run time script sets is stored in fSpoofData.fRawData.fScriptSetsLength. |
| // |
| // Throws ParseException for unrecognized lines, out of range code points, unknown script names, |
| // and for code points whose script differs from the source script given on their line. |
| // Empty input is accepted; it yields no script sets beyond the two reserved slots. |
| // |
| static void buildWSConfusableData(SpoofData fSpoofData, DataOutputStream os, Reader confusablesWS) |
| throws ParseException, java.io.IOException { |
| StringBuffer input = new StringBuffer(); |
| int lineNum = 0; |
| int rtScriptSetsCount = 2; |
| |
| Trie2Writable anyCaseTrie = new Trie2Writable(0, 0); |
| Trie2Writable lowerCaseTrie = new Trie2Writable(0, 0); |
| |
| // The scriptSets vector provides a mapping from TRIE values to the set |
| // of scripts. |
| // |
| // Reserved TRIE values: |
| // 0: Code point has no whole script confusables. |
| // 1: Code point is of script Common or Inherited. |
| // These code points do not participate in whole script confusable |
| // detection. |
| // (This is logically equivalent to saying that they contain confusables |
| // in all scripts) |
| // |
| // Because Trie values are indexes into the ScriptSets vector, pre-fill |
| // vector positions 0 and 1 to avoid conflicts with the reserved values. |
| Vector<BuilderScriptSet> scriptSets = new Vector<BuilderScriptSet>(); |
| scriptSets.addElement(null); |
| scriptSets.addElement(null); |
| |
| readWholeFileToString(confusablesWS, input); |
| |
| Pattern parseRegexp = Pattern.compile(parseExp); |
| |
| // Zap any Byte Order Mark at the start of input. Changing it to a space |
| // is benign given the syntax of the input. |
| // The length test keeps empty input from failing in charAt(0). |
| if (input.length() > 0 && input.charAt(0) == 0xfeff) { |
| input.setCharAt(0, (char) 0x20); |
| } |
| |
| // Parse the input, one line per iteration of this loop. |
| Matcher matcher = parseRegexp.matcher(input); |
| while (matcher.find()) { |
| lineNum++; |
| if (matcher.start(1) >= 0) { |
| // this was a blank or comment line. |
| continue; |
| } |
| if (matcher.start(8) >= 0) { |
| // input file syntax error. |
| throw new ParseException("ConfusablesWholeScript, line " + lineNum + ": Unrecognized input: " |
| + matcher.group(), matcher.start()); |
| } |
| |
| // Pick up the start and optional range end code points from the |
| // parsed line. |
| int startCodePoint = parseCodePoint(matcher, 2, lineNum); |
| int endCodePoint = startCodePoint; |
| if (matcher.start(3) >= 0) { |
| endCodePoint = parseCodePoint(matcher, 3, lineNum); |
| } |
| |
| // Extract the two script names from the source line. |
| String srcScriptName = matcher.group(4); |
| String targScriptName = matcher.group(5); |
| int srcScript = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, srcScriptName); |
| int targScript = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, targScriptName); |
| if (srcScript == UScript.INVALID_CODE) { |
| throw new ParseException("ConfusablesWholeScript, line " + lineNum |
| + ": Invalid script code t: " + matcher.group(4), matcher.start(4)); |
| } |
| if (targScript == UScript.INVALID_CODE) { |
| throw new ParseException("ConfusablesWholeScript, line " + lineNum |
| + ": Invalid script code t: " + matcher.group(5), matcher.start(5)); |
| } |
| |
| // select the table - (A) any case or (L) lower case only |
| Trie2Writable table = anyCaseTrie; |
| if (matcher.start(7) >= 0) { |
| table = lowerCaseTrie; |
| } |
| |
| // Build the set of scripts containing confusable characters for |
| // the code point(s) specified in this input line. |
| // Sanity check that the script of the source code point is the same |
| // as the source script indicated in the input file. Failure of this |
| // check is an error in the input file. |
| // |
| // Include the source script in the set (needed for Mixed Script |
| // Confusable detection). |
| // |
| int cp; |
| for (cp = startCodePoint; cp <= endCodePoint; cp++) { |
| int setIndex = table.get(cp); |
| BuilderScriptSet bsset = null; |
| if (setIndex > 0) { |
| assert (setIndex < scriptSets.size()); |
| bsset = scriptSets.elementAt(setIndex); |
| } else { |
| bsset = new BuilderScriptSet(); |
| bsset.codePoint = cp; |
| bsset.trie = table; |
| bsset.sset = new ScriptSet(); |
| setIndex = scriptSets.size(); |
| bsset.index = setIndex; |
| bsset.rindex = 0; |
| scriptSets.addElement(bsset); |
| table.set(cp, setIndex); |
| } |
| bsset.sset.Union(targScript); |
| bsset.sset.Union(srcScript); |
| |
| int cpScript = UScript.getScript(cp); |
| if (cpScript != srcScript) { |
| throw new ParseException("ConfusablesWholeScript, line " + lineNum |
| + ": Mismatch between source script and code point " + Integer.toString(cp, 16), |
| matcher.start(5)); |
| } |
| } |
| } |
| |
| // Eliminate duplicate script sets. At this point we have a separate |
| // script set for every code point that had data in the input file. |
| // |
| // We eliminate underlying ScriptSet objects, not the BuildScriptSets |
| // that wrap them |
| // |
| { |
| rtScriptSetsCount = 2; |
| for (int outeri = 2; outeri < scriptSets.size(); outeri++) { |
| BuilderScriptSet outerSet = scriptSets.elementAt(outeri); |
| if (outerSet.index != outeri) { |
| // This set was already identified as a duplicate. |
| // It will not be allocated a position in the runtime array |
| // of ScriptSets. |
| continue; |
| } |
| outerSet.rindex = rtScriptSetsCount++; |
| for (int inneri = outeri + 1; inneri < scriptSets.size(); inneri++) { |
| BuilderScriptSet innerSet = scriptSets.elementAt(inneri); |
| if (outerSet.sset.equals(innerSet.sset) && outerSet.sset != innerSet.sset) { |
| innerSet.sset = outerSet.sset; |
| innerSet.index = outeri; |
| innerSet.rindex = outerSet.rindex; |
| } |
| // But this doesn't get all. We need to fix the TRIE. |
| } |
| } |
| } |
| |
| // Update the Trie values to be reflect the run time script indexes |
| // (after duplicate merging). |
| // (Trie Values 0 and 1 are reserved, and the corresponding slots in |
| // scriptSets are unused, which is why the loop index starts at 2.) |
| { |
| for (int i = 2; i < scriptSets.size(); i++) { |
| BuilderScriptSet bSet = scriptSets.elementAt(i); |
| if (bSet.rindex != i) { |
| bSet.trie.set(bSet.codePoint, bSet.rindex); |
| } |
| } |
| } |
| |
| // For code points with script==Common or script==Inherited, |
| // Set the reserved value of 1 into both Tries. These characters do not |
| // participate in Whole Script Confusable detection; this reserved value |
| // is the means by which they are detected. |
| { |
| UnicodeSet ignoreSet = new UnicodeSet(); |
| ignoreSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON); |
| UnicodeSet inheritedSet = new UnicodeSet(); |
| inheritedSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED); |
| ignoreSet.addAll(inheritedSet); |
| for (int rn = 0; rn < ignoreSet.getRangeCount(); rn++) { |
| int rangeStart = ignoreSet.getRangeStart(rn); |
| int rangeEnd = ignoreSet.getRangeEnd(rn); |
| anyCaseTrie.setRange(rangeStart, rangeEnd, 1, true); |
| lowerCaseTrie.setRange(rangeStart, rangeEnd, 1, true); |
| } |
| } |
| |
| // Serialize the data to the Spoof Detector |
| { |
| anyCaseTrie.toTrie2_16().serialize(os); |
| lowerCaseTrie.toTrie2_16().serialize(os); |
| |
| fSpoofData.fRawData.fScriptSetsLength = rtScriptSetsCount; |
| int rindex = 2; |
| for (int i = 2; i < scriptSets.size(); i++) { |
| BuilderScriptSet bSet = scriptSets.elementAt(i); |
| if (bSet.rindex < rindex) { |
| // We have already copied this script set to the serialized |
| // data. |
| continue; |
| } |
| assert (rindex == bSet.rindex); |
| bSet.sset.output(os); |
| rindex++; |
| } |
| } |
| } |
| |
| // Parse the hexadecimal code point captured by the given match group of the current line. |
| // A value larger than 0x10ffff, including one with too many digits to fit in an int, is |
| // reported as a ParseException positioned at the start of that group. |
| private static int parseCodePoint(Matcher matcher, int group, int lineNum) throws ParseException { |
| String hex = matcher.group(group); |
| int codePoint; |
| try { |
| codePoint = Integer.parseInt(hex, 16); |
| } catch (NumberFormatException e) { |
| // The group holds hex digits only, so the failure means the value overflows an int |
| // and is therefore certainly out of range. |
| codePoint = Integer.MAX_VALUE; |
| } |
| if (codePoint > 0x10ffff) { |
| throw new ParseException("ConfusablesWholeScript, line " + lineNum |
| + ": out of range code point: " + hex, matcher.start(group)); |
| } |
| return codePoint; |
| } |
| |
| // class BuilderScriptSet. Build time record for one source code point: the set of |
| // scripts (Script Codes) containing characters that are confusable with that |
| // code point, together with the bookkeeping used while building the data. |
| private static class BuilderScriptSet { |
| int codePoint = -1; // The source code point; -1 until one is assigned. |
| Trie2Writable trie = null; // The Trie (Any-case or Lower-case) that this set of data is for. |
| // The two Trie tables are the final result of the build. |
| ScriptSet sset = null; // The set of scripts itself. |
| int index = 0; // Index of this set in the Build Time vector of script sets. |
| int rindex = 0; // Index of this set in the final (runtime) array of sets. |
| |
| BuilderScriptSet() { |
| // Every field starts from its initializer above. |
| } |
| } |
| |
| } |
| |
| /* |
| * ***************************************************************************** |
| * Internal classes for compiling confusable data into its binary (runtime) form. |
| * ***************************************************************************** |
| */ |
| // --------------------------------------------------------------------- |
| // |
| // buildConfusableData Compile the source confusable data, as defined by |
| // the Unicode data file confusables.txt, into the binary |
| // structures used by the confusable detector. |
| // |
| // The binary structures are described in uspoof_impl.h |
| // |
| // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA |
| // tables. Each maps from an int to a String. |
| // |
| // 2. Sort all of the strings encountered by length, since they will need to |
| // be stored in that order in the final string table. |
| // |
| // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the |
| // list because that will be the ordering of our runtime table. |
| // |
| // 4. Generate the run time string table. This is generated before the key & value |
| // tables because we need the string indexes when building those tables. |
| // |
| // 5. Build the run-time key and value tables. These are parallel tables, and |
| // are built at the same time |
| |
| // class ConfusabledataBuilder |
| // An instance of this class exists while the confusable data is being built |
| // from source. |
| // It encapsulates the intermediate data structures that are used for building. |
| // It exports one static function, to do a confusable data build. |
| private static class ConfusabledataBuilder { |
| private SpoofData fSpoofData; |
| private ByteArrayOutputStream bos; |
| private DataOutputStream os; |
| private Hashtable<Integer, SPUString> fSLTable; |
| private Hashtable<Integer, SPUString> fSATable; |
| private Hashtable<Integer, SPUString> fMLTable; |
| private Hashtable<Integer, SPUString> fMATable; |
| private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the |
| // four mapping tables. |
| |
| // The binary data is first assembled into the following four collections, |
| // then output to the DataOutputStream os. |
| private StringBuffer fStringTable; |
| private Vector<Integer> fKeyVec; |
| private Vector<Integer> fValueVec; |
| private Vector<Integer> fStringLengthsTable; |
| private SPUStringPool stringPool; |
| private Pattern fParseLine; |
| private Pattern fParseHexNum; |
| private int fLineNum; |
| |
// Creates a builder whose compiled output is written to 'bos' (through a
// DataOutputStream wrapper) and whose results are recorded in 'spData'.
// All intermediate tables start out empty.
ConfusabledataBuilder(SpoofData spData, ByteArrayOutputStream bos) {
    this.fSpoofData = spData;
    this.bos = bos;
    this.os = new DataOutputStream(bos);

    // One mapping table per confusable table type named in the source data.
    this.fSLTable = new Hashtable<Integer, SPUString>();
    this.fSATable = new Hashtable<Integer, SPUString>();
    this.fMLTable = new Hashtable<Integer, SPUString>();
    this.fMATable = new Hashtable<Integer, SPUString>();

    // Every key code point seen in any of the four tables.
    this.fKeySet = new UnicodeSet();

    // Parallel run-time key and value tables, plus the pool of mapping strings.
    this.fKeyVec = new Vector<Integer>();
    this.fValueVec = new Vector<Integer>();
    this.stringPool = new SPUStringPool();
}
| |
| void build(Reader confusables) throws ParseException, java.io.IOException { |
| StringBuffer fInput = new StringBuffer(); |
| WSConfusableDataBuilder.readWholeFileToString(confusables, fInput); |
| |
| // Regular Expression to parse a line from Confusables.txt. The |
| // expression will match |
| // any line. What was matched is determined by examining which capture |
| // groups have a match. |
| // Capture Group 1: the source char |
| // Capture Group 2: the replacement chars |
| // Capture Group 3-6 the table type, SL, SA, ML, or MA |
| // Capture Group 7: A blank or comment only line. |
| // Capture Group 8: A syntactically invalid line. Anything that didn't |
| // match before. |
| // Example Line from the confusables.txt source file: |
| // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " |
| fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match |
| // the |
| // source |
| // char |
| "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s) |
| "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued) |
| "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type |
| "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment |
| "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with |
| // only a #comment |
| "|^(.*?)$"); // OR match any line, which catches illegal lines. |
| |
| // Regular expression for parsing a hex number out of a space-separated |
| // list of them. |
| // Capture group 1 gets the number, with spaces removed. |
| fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)"); |
| |
| // Zap any Byte Order Mark at the start of input. Changing it to a space |
| // is benign |
| // given the syntax of the input. |
| if (fInput.charAt(0) == 0xfeff) { |
| fInput.setCharAt(0, (char) 0x20); |
| } |
| |
| // Parse the input, one line per iteration of this loop. |
| Matcher matcher = fParseLine.matcher(fInput); |
| while (matcher.find()) { |
| fLineNum++; |
| if (matcher.start(7) >= 0) { |
| // this was a blank or comment line. |
| continue; |
| } |
| if (matcher.start(8) >= 0) { |
| // input file syntax error. |
| // status = U_PARSE_ERROR; |
| throw new ParseException("Confusables, line " + fLineNum + ": Unrecognized Line: " |
| + matcher.group(8), matcher.start(8)); |
| } |
| |
| // We have a good input line. Extract the key character and mapping |
| // string, and |
| // put them into the appropriate mapping table. |
| int keyChar = Integer.parseInt(matcher.group(1), 16); |
| if (keyChar > 0x10ffff) { |
| throw new ParseException("Confusables, line " + fLineNum + ": Bad code point: " |
| + matcher.group(1), matcher.start(1)); |
| } |
| Matcher m = fParseHexNum.matcher(matcher.group(2)); |
| |
| StringBuffer mapString = new StringBuffer(); |
| while (m.find()) { |
| int c = Integer.parseInt(m.group(1), 16); |
| if (keyChar > 0x10ffff) { |
| throw new ParseException("Confusables, line " + fLineNum + ": Bad code point: " |
| + Integer.toString(c, 16), matcher.start(2)); |
| } |
| mapString.append(c); |
| } |
| assert (mapString.length() >= 1); |
| |
| // Put the map (value) string into the string pool |
| // This a little like a Java intern() - any duplicates will be |
| // eliminated. |
| SPUString smapString = stringPool.addString(mapString.toString()); |
| |
| // Add the char . string mapping to the appropriate table. |
| Hashtable<Integer, SPUString> table = matcher.start(3) >= 0 ? fSLTable |
| : matcher.start(4) >= 0 ? fSATable : matcher.start(5) >= 0 ? fMLTable |
| : matcher.start(6) >= 0 ? fMATable : null; |
| assert (table != null); |
| table.put(keyChar, smapString); |
| fKeySet.add(keyChar); |
| } |
| |
| // Input data is now all parsed and collected. |
| // Now create the run-time binary form of the data. |
| // |
| // This is done in two steps. First the data is assembled into vectors |
| // and strings, |
| // for ease of construction, then the contents of these collections are |
| // dumped |
| // into the actual raw-bytes data storage. |
| |
| // Build up the string array, and record the index of each string |
| // therein |
| // in the (build time only) string pool. |
| // Strings of length one are not entered into the strings array. |
| // At the same time, build up the string lengths table, which records |
| // the |
| // position in the string table of the first string of each length >= 4. |
| // (Strings in the table are sorted by length) |
| stringPool.sort(); |
| fStringTable = new StringBuffer(); |
| fStringLengthsTable = new Vector<Integer>(); |
| int previousStringLength = 0; |
| int previousStringIndex = 0; |
| int poolSize = stringPool.size(); |
| int i; |
| for (i = 0; i < poolSize; i++) { |
| SPUString s = stringPool.getByIndex(i); |
| int strLen = s.fStr.length(); |
| int strIndex = fStringTable.length(); |
| assert (strLen >= previousStringLength); |
| if (strLen == 1) { |
| // strings of length one do not get an entry in the string |
| // table. |
| // Keep the single string character itself here, which is the |
| // same |
| // convention that is used in the final run-time string table |
| // index. |
| s.fStrTableIndex = s.fStr.charAt(0); |
| } else { |
| if ((strLen > previousStringLength) && (previousStringLength >= 4)) { |
| fStringLengthsTable.addElement(previousStringIndex); |
| fStringLengthsTable.addElement(previousStringLength); |
| } |
| s.fStrTableIndex = strIndex; |
| fStringTable.append(s.fStr); |
| } |
| previousStringLength = strLen; |
| previousStringIndex = strIndex; |
| } |
| // Make the final entry to the string lengths table. |
| // (it holds an entry for the _last_ string of each length, so adding |
| // the |
| // final one doesn't happen in the main loop because no longer string |
| // was encountered.) |
| if (previousStringLength >= 4) { |
| fStringLengthsTable.addElement(previousStringIndex); |
| fStringLengthsTable.addElement(previousStringLength); |
| } |
| |
| // Construct the compile-time Key and Value tables |
| // |
| // For each key code point, check which mapping tables it applies to, |
| // and create the final data for the key & value structures. |
| // |
| // The four logical mapping tables are conflated into one combined |
| // table. |
| // If multiple logical tables have the same mapping for some key, they |
| // share a single entry in the combined table. |
| // If more than one mapping exists for the same key code point, multiple |
| // entries will be created in the table |
| |
| for (int range = 0; range < fKeySet.getRangeCount(); range++) { |
| // It is an oddity of the UnicodeSet API that simply enumerating the |
| // contained |
| // code points requires a nested loop. |
| for (int keyChar = fKeySet.getRangeStart(range); keyChar <= fKeySet.getRangeEnd(range); keyChar++) { |
| addKeyEntry(keyChar, fSLTable, SpoofChecker.SL_TABLE_FLAG); |
| addKeyEntry(keyChar, fSATable, SpoofChecker.SA_TABLE_FLAG); |
| addKeyEntry(keyChar, fMLTable, SpoofChecker.ML_TABLE_FLAG); |
| addKeyEntry(keyChar, fMATable, SpoofChecker.MA_TABLE_FLAG); |
| } |
| } |
| |
| // Put the assembled data into the flat runtime array |
| outputData(); |
| |
| // All of the intermediate allocated data belongs to the |
| // ConfusabledataBuilder object (this), and is deleted by Java GC. |
| } |
| |
| // Add an entry to the key and value tables being built |
| // input: data from SLTable, MATable, etc. |
| // outut: entry added to fKeyVec and fValueVec |
| // addKeyEntry Construction of the confusable Key and Mapping Values tables. |
| // This is an intermediate point in the building process. |
| // We already have the mappings in the hash tables fSLTable, etc. |
| // This function builds corresponding run-time style table entries into |
| // fKeyVec and fValueVec |
| void addKeyEntry(int keyChar, // The key character |
| Hashtable<Integer, SPUString> table, // The table, one of SATable, |
| // MATable, etc. |
| int tableFlag) { // One of SA_TABLE_FLAG, etc. |
| SPUString targetMapping = table.get(keyChar); |
| if (targetMapping == null) { |
| // No mapping for this key character. |
| // (This function is called for all four tables for each key char |
| // that |
| // is seen anywhere, so this no entry cases are very much expected.) |
| return; |
| } |
| |
| // Check whether there is already an entry with the correct mapping. |
| // If so, simply set the flag in the keyTable saying that the existing |
| // entry |
| // applies to the table that we're doing now. |
| boolean keyHasMultipleValues = false; |
| int i; |
| for (i = fKeyVec.size() - 1; i >= 0; i--) { |
| int key = fKeyVec.elementAt(i); |
| if ((key & 0x0ffffff) != keyChar) { |
| // We have now checked all existing key entries for this key |
| // char (if any) |
| // without finding one with the same mapping. |
| break; |
| } |
| String mapping = getMapping(i); |
| if (mapping.equals(targetMapping.fStr)) { |
| // The run time entry we are currently testing has the correct |
| // mapping. |
| // Set the flag in it indicating that it applies to the new |
| // table also. |
| key |= tableFlag; |
| fKeyVec.setElementAt(key, i); |
| return; |
| } |
| keyHasMultipleValues = true; |
| } |
| |
| // Need to add a new entry to the binary data being built for this |
| // mapping. |
| // Includes adding entries to both the key table and the parallel values |
| // table. |
| int newKey = keyChar | tableFlag; |
| if (keyHasMultipleValues) { |
| newKey |= SpoofChecker.KEY_MULTIPLE_VALUES; |
| } |
| int adjustedMappingLength = targetMapping.fStr.length() - 1; |
| if (adjustedMappingLength > 3) { |
| adjustedMappingLength = 3; |
| } |
| newKey |= adjustedMappingLength << SpoofChecker.KEY_LENGTH_SHIFT; |
| |
| int newData = targetMapping.fStrTableIndex; |
| |
| fKeyVec.addElement(newKey); |
| fValueVec.addElement(newData); |
| |
| // If the preceding key entry is for the same key character (but with a |
| // different mapping) |
| // set the multiple-values flag on it. |
| if (keyHasMultipleValues) { |
| int previousKeyIndex = fKeyVec.size() - 2; |
| int previousKey = fKeyVec.elementAt(previousKeyIndex); |
| previousKey |= SpoofChecker.KEY_MULTIPLE_VALUES; |
| fKeyVec.setElementAt(previousKey, previousKeyIndex); |
| } |
| } |
| |
| // From an index into fKeyVec & fValueVec |
| // get a String with the corresponding mapping. |
| String getMapping(int index) { |
| int key = fKeyVec.elementAt(index); |
| int value = fValueVec.elementAt(index); |
| int length = SpoofChecker.getKeyLength(key); |
| int lastIndexWithLen; |
| switch (length) { |
| case 0: |
| char[] cs = { (char) value }; |
| return new String(cs); |
| case 1: |
| case 2: |
| return fStringTable.substring(value, value + length + 1); // Note: +1 as optimization |
| case 3: |
| length = 0; |
| int i; |
| for (i = 0; i < fStringLengthsTable.size(); i += 2) { |
| lastIndexWithLen = fStringLengthsTable.elementAt(i); |
| if (value <= lastIndexWithLen) { |
| length = fStringLengthsTable.elementAt(i + 1); |
| break; |
| } |
| } |
| assert (length >= 3); |
| return fStringTable.substring(value, value + length); |
| default: |
| assert (false); |
| } |
| return new String(); |
| } |
| |
| // Populate the final binary output data array with the compiled data. |
| // The confusable data has been compiled and stored in intermediate |
| // collections and strings. Copy it from there to the final flat |
| // binary array. |
| void outputData() throws java.io.IOException { |
| |
| SpoofDataHeader rawData = fSpoofData.fRawData; |
| // The Key Table |
| // While copying the keys to the runtime array, |
| // also sanity check that they are sorted. |
| int numKeys = fKeyVec.size(); |
| int i; |
| int previousKey = 0; |
| rawData.output(os); |
| rawData.fCFUKeys = os.size(); |
| assert (rawData.fCFUKeys == 128); |
| rawData.fCFUKeysSize = numKeys; |
| for (i = 0; i < numKeys; i++) { |
| int key = fKeyVec.elementAt(i); |
| assert ((key & 0x00ffffff) >= (previousKey & 0x00ffffff)); |
| assert ((key & 0xff000000) != 0); |
| os.writeInt(key); |
| previousKey = key; |
| } |
| |
| // The Value Table, parallels the key table |
| int numValues = fValueVec.size(); |
| assert (numKeys == numValues); |
| rawData.fCFUStringIndex = os.size(); |
| rawData.fCFUStringIndexSize = numValues; |
| for (i = 0; i < numValues; i++) { |
| int value = fValueVec.elementAt(i); |
| assert (value < 0xffff); |
| os.writeShort((short) value); |
| } |
| |
| // The Strings Table. |
| |
| int stringsLength = fStringTable.length(); |
| // Reserve an extra space so the string will be nul-terminated. This is |
| // only a convenience, for when debugging; it is not needed otherwise. |
| String strings = fStringTable.toString(); |
| rawData.fCFUStringTable = os.size(); |
| rawData.fCFUStringTableLen = stringsLength; |
| for (i = 0; i < stringsLength; i++) { |
| os.writeChar(strings.charAt(i)); |
| } |
| |
| // The String Lengths Table |
| // While copying into the runtime array do some sanity checks on the |
| // values |
| // Each complete entry contains two fields, an index and an offset. |
| // Lengths should increase with each entry. |
| // Offsets should be less than the size of the string table. |
| int lengthTableLength = fStringLengthsTable.size(); |
| int previousLength = 0; |
| // Note: StringLengthsSize in the raw data is the number of complete |
| // entries, |
| // each consisting of a pair of 16 bit values, hence the divide by 2. |
| rawData.fCFUStringLengthsSize = lengthTableLength / 2; |
| rawData.fCFUStringLengths = os.size(); |
| for (i = 0; i < lengthTableLength; i += 2) { |
| int offset = fStringLengthsTable.elementAt(i); |
| int length = fStringLengthsTable.elementAt(i + 1); |
| assert (offset < stringsLength); |
| assert (length < 40); |
| assert (length > previousLength); |
| os.writeShort((short) offset); |
| os.writeShort((short) length); |
| previousLength = length; |
| } |
| |
| os.flush(); |
| DataInputStream is = new DataInputStream(new ByteArrayInputStream(bos.toByteArray())); |
| is.mark(Integer.MAX_VALUE); |
| fSpoofData.initPtrs(is); |
| } |
| |
| public static void buildConfusableData(SpoofData spData, Reader confusables) throws java.io.IOException, |
| ParseException { |
| ByteArrayOutputStream bos = new ByteArrayOutputStream(); |
| ConfusabledataBuilder builder = new ConfusabledataBuilder(spData, bos); |
| builder.build(confusables); |
| } |
| |
| /* |
| * ***************************************************************************** |
| * Internal classes for compiling confusable data into its binary (runtime) form. |
| * ***************************************************************************** |
| */ |
| // SPUString |
| // Holds a string that is the result of one of the mappings defined |
| // by the confusable mapping data (confusables.txt from Unicode.org) |
| // Instances of SPUString exist during the compilation process only. |
| |
| private static class SPUString { |
| String fStr; // The actual string. |
| int fStrTableIndex; // Index into the final runtime data for this string. |
| |
| // (or, for length 1, the single string char itself, |
| // there being no string table entry for it.) |
| SPUString(String s) { |
| fStr = s; |
| fStrTableIndex = 0; |
| } |
| } |
| |
| // Comparison function for ordering strings in the string pool. |
| // Compare by length first, then, within a group of the same length, |
| // by code point order. |
| // Conforms to the type signature for a USortComparator in uvector.h |
| private static class SPUStringComparator implements Comparator<SPUString> { |
| public int compare(SPUString sL, SPUString sR) { |
| int lenL = sL.fStr.length(); |
| int lenR = sR.fStr.length(); |
| if (lenL < lenR) { |
| return -1; |
| } else if (lenL > lenR) { |
| return 1; |
| } else { |
| return sL.fStr.compareTo(sR.fStr); |
| } |
| } |
| } |
| |
| // String Pool A utility class for holding the strings that are the result of |
| // the spoof mappings. These strings will utimately end up in the |
| // run-time String Table. |
| // This is sort of like a sorted set of strings, except that ICU's anemic |
| // built-in collections don't support those, so it is implemented with a |
| // combination of a uhash and a Vector. |
| private static class SPUStringPool { |
| public SPUStringPool() { |
| fVec = new Vector<SPUString>(); |
| fHash = new Hashtable<String, SPUString>(); |
| } |
| |
| public int size() { |
| return fVec.size(); |
| } |
| |
| // Get the n-th string in the collection. |
| public SPUString getByIndex(int index) { |
| SPUString retString = fVec.elementAt(index); |
| return retString; |
| } |
| |
| // Add a string. Return the string from the table. |
| // If the input parameter string is already in the table, delete the |
| // input parameter and return the existing string. |
| public SPUString addString(String src) { |
| SPUString hashedString = fHash.get(src); |
| if (hashedString == null) { |
| hashedString = new SPUString(src); |
| fHash.put(src, hashedString); |
| fVec.addElement(hashedString); |
| } |
| return hashedString; |
| } |
| |
| // Sort the contents; affects the ordering of getByIndex(). |
| public void sort() { |
| Collections.sort(fVec, new SPUStringComparator()); |
| } |
| |
| private Vector<SPUString> fVec; // Elements are SPUString * |
| private Hashtable<String, SPUString> fHash; // Key: Value: |
| } |
| |
| } |
| } |
| |
| /** |
| * Get the set of checks that this Spoof Checker has been configured to perform. |
| * |
| * @return The set of checks that this spoof checker will perform. |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public int getChecks() { |
| return fChecks; |
| } |
| |
| /** |
| * Get a list of locales for the scripts that are acceptable in strings to be checked. If no limitations on scripts |
| * have been specified, an empty set will be returned. |
| * |
| * setAllowedChars() will reset the list of allowed locales to be empty. |
| * |
| * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales(); |
| * the information other than languages from the originally specified locales may be omitted. |
| * |
| * @return A set of locales corresponding to the acceptable scripts. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public Set<ULocale> getAllowedLocales() { |
| return fAllowedLocales; |
| } |
| |
| /** |
| * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set |
| * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by |
| * this function. |
| * |
| * The returned set will be frozen, meaning that it cannot be modified by the caller. |
| * |
| * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test. |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public UnicodeSet getAllowedChars() { |
| return fAllowedCharsSet; |
| } |
| |
| /** |
| * A struct-like class to hold the results of a Spoof Check operation. |
| * Tells which check(s) have failed |
| * and the position within the string where the failure was found. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public static class CheckResult { |
| /** |
| * Indicate which of the spoof check(s) has failed. The value is a bitwise OR |
| * of the constants for the tests in question, SINGLE_SCRIPT_CONFUSABLE, |
| * MIXED_SCRIPT_CONFUSABLE, WHOLE_SCRIPT_CONFUSABLE, and so on. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public int checks; |
| /** |
| * The index of the first string position that failed a check. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public int position; |
| |
| /** |
| * Default constructor |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public CheckResult() { |
| checks = 0; |
| position = 0; |
| } |
| } |
| |
| /** |
| * Check the specified string for possible security issues. The text to be checked will typically be an identifier |
| * of some sort. The set of checks to be performed was specified when building the SpoofChecker. |
| * |
| * @param text |
| * A String to be checked for possible security issues. |
| * @param checkResult |
| * Output parameter, indicates which specific tests failed. |
| * May be null if the information is not wanted. |
| * @return True there any issue is found with the input string. |
| * @draft ICU 4.8 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public boolean failsChecks(String text, CheckResult checkResult) { |
| int length = text.length(); |
| |
| int result = 0; |
| int failPos = Integer.MAX_VALUE; |
| |
| // A count of the number of non-Common or inherited scripts. |
| // Needed for both the SINGLE_SCRIPT and the |
| // WHOLE/MIXED_SCIRPT_CONFUSABLE tests. |
| // Share the computation when possible. scriptCount == -1 means that we |
| // haven't done it yet. |
| int scriptCount = -1; |
| |
| if (0 != ((this.fChecks) & SINGLE_SCRIPT)) { |
| scriptCount = this.scriptScan(text, checkResult); |
| // no need to set failPos, it will be set to checkResult.position inside this.scriptScan |
| // printf("scriptCount (clipped to 2) = %d\n", scriptCount); |
| if (scriptCount >= 2) { |
| // Note: scriptCount == 2 covers all cases of the number of |
| // scripts >= 2 |
| result |= SINGLE_SCRIPT; |
| } |
| } |
| |
| if (0 != (this.fChecks & CHAR_LIMIT)) { |
| int i; |
| int c; |
| for (i = 0; i < length;) { |
| // U16_NEXT(text, i, length, c); |
| c = Character.codePointAt(text, i); |
| i = Character.offsetByCodePoints(text, i, 1); |
| if (!this.fAllowedCharsSet.contains(c)) { |
| result |= CHAR_LIMIT; |
| if (i < failPos) { |
| failPos = i; |
| } |
| break; |
| } |
| } |
| } |
| |
| if (0 != (this.fChecks & (WHOLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | INVISIBLE))) { |
| // These are the checks that need to be done on NFD input |
| String nfdText = Normalizer.normalize(text, Normalizer.NFD, 0); |
| |
| if (0 != (this.fChecks & INVISIBLE)) { |
| |
| // scan for more than one occurence of the same non-spacing mark |
| // in a sequence of non-spacing marks. |
| int i; |
| int c; |
| int firstNonspacingMark = 0; |
| boolean haveMultipleMarks = false; |
| UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a |
| // single combining sequence. |
| for (i = 0; i < length;) { |
| // U16_NEXT(nfdText, i, nfdLength, c); |
| c = Character.codePointAt(nfdText, i); |
| i = Character.offsetByCodePoints(nfdText, i, 1); |
| if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) { |
| firstNonspacingMark = 0; |
| if (haveMultipleMarks) { |
| marksSeenSoFar.clear(); |
| haveMultipleMarks = false; |
| } |
| continue; |
| } |
| if (firstNonspacingMark == 0) { |
| firstNonspacingMark = c; |
| continue; |
| } |
| if (!haveMultipleMarks) { |
| marksSeenSoFar.add(firstNonspacingMark); |
| haveMultipleMarks = true; |
| } |
| if (marksSeenSoFar.contains(c)) { |
| // report the error, and stop scanning. |
| // No need to find more than the first failure. |
| result |= INVISIBLE; |
| failPos = i; |
| break; |
| } |
| marksSeenSoFar.add(c); |
| } |
| } |
| |
| if (0 != (this.fChecks & (WHOLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE))) { |
| // The basic test is the same for both whole and mixed script |
| // confusables. |
| // Compute the set of scripts that every input character has a |
| // confusable in. |
| // For this computation an input character is always considered |
| // to be |
| // confusable with itself in its own script. |
| // If the number of such scripts is two or more, and the input |
| // consisted of |
| // characters all from a single script, we have a whole script |
| // confusable. |
| // (The two scripts will be the original script and the one that |
| // is confusable) |
| // If the number of such scripts >= one, and the original input |
| // contained characters from |
| // more than one script, we have a mixed script confusable. (We |
| // can transform |
| // some of the characters, and end up with a visually similar |
| // string all in |
| // one script.) |
| |
| if (scriptCount == -1) { |
| scriptCount = this.scriptScan(text, null); |
| } |
| |
| ScriptSet scripts = new ScriptSet(); |
| this.wholeScriptCheck(nfdText, scripts); |
| int confusableScriptCount = scripts.countMembers(); |
| // printf("confusableScriptCount = %d\n", |
| // confusableScriptCount); |
| |
| if ((0 != (this.fChecks & WHOLE_SCRIPT_CONFUSABLE)) && confusableScriptCount >= 2 && scriptCount == 1) { |
| result |= WHOLE_SCRIPT_CONFUSABLE; |
| } |
| |
| if ((0 != (this.fChecks & MIXED_SCRIPT_CONFUSABLE)) && confusableScriptCount >= 1 && scriptCount > 1) { |
| result |= MIXED_SCRIPT_CONFUSABLE; |
| } |
| } |
| } |
| if (checkResult != null) { |
| checkResult.checks = result; |
| if (failPos != Integer.MAX_VALUE) { |
| checkResult.position = failPos; |
| } |
| } |
| return (0 != result); |
| } |
| |
| /** |
| * Check the specified string for possible security issues. The text to be checked will typically be an identifier |
| * of some sort. The set of checks to be performed was specified when building the SpoofChecker. |
| * |
| * @param text |
| * A String to be checked for possible security issues. |
| * @return True there any issue is found with the input string. |
| * @draft ICU 4.8 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public boolean failsChecks(String text) { |
| return failsChecks(text, null); |
| } |
| |
| /** |
| * Check the whether two specified strings are visually confusable. The types of confusability to be tested - single |
| * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker. |
| * |
| * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE |
| * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected. |
| * |
| * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case |
| * folded for comparison and display to the user, do not select the ANY_CASE option. |
| * |
| * |
| * @param s1 |
| * The first of the two strings to be compared for confusability. |
| * @param s2 |
| * The second of the two strings to be compared for confusability. |
| * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability |
| * found, as defined by spoof check test constants. |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public int areConfusable(String s1, String s2) { |
| // |
| // See section 4 of UAX 39 for the algorithm for checking whether two |
| // strings are confusable, |
| // and for definitions of the types (single, whole, mixed-script) of |
| // confusables. |
| |
| // We only care about a few of the check flags. Ignore the others. |
| // If no tests relavant to this function have been specified, signal an |
| // error. |
| // TODO: is this really the right thing to do? It's probably an error on |
| // the caller's part, but logically we would just return 0 (no error). |
| if ((this.fChecks & (SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE)) == 0) { |
| throw new IllegalArgumentException("No confusable checks are enabled."); |
| } |
| int flagsForSkeleton = this.fChecks & ANY_CASE; |
| String s1Skeleton; |
| String s2Skeleton; |
| |
| int result = 0; |
| int s1ScriptCount = this.scriptScan(s1, null); |
| int s2ScriptCount = this.scriptScan(s2, null); |
| |
| if (0 != (this.fChecks & SINGLE_SCRIPT_CONFUSABLE)) { |
| // Do the Single Script compare. |
| if (s1ScriptCount <= 1 && s2ScriptCount <= 1) { |
| flagsForSkeleton |= SINGLE_SCRIPT_CONFUSABLE; |
| s1Skeleton = getSkeleton(flagsForSkeleton, s1); |
| s2Skeleton = getSkeleton(flagsForSkeleton, s2); |
| if (s1Skeleton.length() == s2Skeleton.length() && s1Skeleton.equals(s2Skeleton)) { |
| result |= SINGLE_SCRIPT_CONFUSABLE; |
| } |
| } |
| } |
| |
| if (0 != (result & SINGLE_SCRIPT_CONFUSABLE)) { |
| // If the two inputs are single script confusable they cannot also |
| // be |
| // mixed or whole script confusable, according to the UAX39 |
| // definitions. |
| // So we can skip those tests. |
| return result; |
| } |
| |
| // Optimization for whole script confusables test: two identifiers are |
| // whole script confusable if |
| // each is of a single script and they are mixed script confusable. |
| boolean possiblyWholeScriptConfusables = s1ScriptCount <= 1 && s2ScriptCount <= 1 |
| && (0 != (this.fChecks & WHOLE_SCRIPT_CONFUSABLE)); |
| |
| // Mixed Script Check |
| if ((0 != (this.fChecks & MIXED_SCRIPT_CONFUSABLE)) || possiblyWholeScriptConfusables) { |
| // For getSkeleton(), resetting the SINGLE_SCRIPT_CONFUSABLE flag |
| // will get us |
| // the mixed script table skeleton, which is what we want. |
| // The Any Case / Lower Case bit in the skelton flags was set at the |
| // top of the function. |
| flagsForSkeleton &= ~SINGLE_SCRIPT_CONFUSABLE; |
| s1Skeleton = getSkeleton(flagsForSkeleton, s1); |
| s2Skeleton = getSkeleton(flagsForSkeleton, s2); |
| if (s1Skeleton.length() == s2Skeleton.length() && s1Skeleton.equals(s2Skeleton)) { |
| result |= MIXED_SCRIPT_CONFUSABLE; |
| if (possiblyWholeScriptConfusables) { |
| result |= WHOLE_SCRIPT_CONFUSABLE; |
| } |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are |
| * confusable if their skeletons are identical. See Unicode UAX 39 for additional information. |
| * |
| * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some |
| * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons. |
| * |
| * @param type |
| * The type of skeleton, corresponding to which of the Unicode confusable data tables to use. The default |
| * is Mixed-Script, Lowercase. Allowed options are SINGLE_SCRIPT_CONFUSABLE and ANY_CASE_CONFUSABLE. The |
| * two flags may be ORed. |
| * @param s |
| * The input string whose skeleton will be genereated. |
| * @return The output skeleton string. |
| * |
| * @draft ICU 4.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public String getSkeleton(int type, String s) { |
| // TODO: this function could be sped up a bit |
| // Skip the input normalization when not needed, work from callers data. |
| // It probably won't need normalization. |
| if ((type & ~(SINGLE_SCRIPT_CONFUSABLE | ANY_CASE)) != 0) { |
| // *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return null; |
| } |
| |
| int tableMask = 0; |
| switch (type) { |
| case 0: |
| tableMask = ML_TABLE_FLAG; |
| break; |
| case SINGLE_SCRIPT_CONFUSABLE: |
| tableMask = SL_TABLE_FLAG; |
| break; |
| case ANY_CASE: |
| tableMask = MA_TABLE_FLAG; |
| break; |
| case SINGLE_SCRIPT_CONFUSABLE | ANY_CASE: |
| tableMask = SA_TABLE_FLAG; |
| break; |
| default: |
| // *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return null; |
| } |
| |
| // NFD transform of the user supplied input |
| String nfdInput = Normalizer.normalize(s, Normalizer.NFD, 0); |
| int normalizedLen = nfdInput.length(); |
| |
| // Apply the skeleton mapping to the NFD normalized input string |
| // Accumulate the skeleton, possibly unnormalized, in a String. |
| int inputIndex = 0; |
| StringBuilder skelStr = new StringBuilder(); |
| while (inputIndex < normalizedLen) { |
| int c; |
| c = Character.codePointAt(nfdInput, inputIndex); |
| inputIndex = Character.offsetByCodePoints(nfdInput, inputIndex, 1); |
| this.confusableLookup(c, tableMask, skelStr); |
| } |
| |
| String result = skelStr.toString(); |
| String normedResult; |
| |
| // Check the skeleton for NFD, normalize it if needed. |
| // Unnormalized results should be very rare. |
| if (!Normalizer.isNormalized(result, Normalizer.NFD, 0)) { |
| normedResult = Normalizer.normalize(result, Normalizer.NFD, 0); |
| result = normedResult; |
| } |
| return result; |
| } |
| |
| /* |
| * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be appended |
| * will between 1 and 18 characters. |
| * |
| * This is the heart of the confusable skeleton generation implementation. |
| * |
| * @param tableMask bit flag specifying which confusable table to use. One of SL_TABLE_FLAG, MA_TABLE_FLAG, etc. |
| */ |
| private void confusableLookup(int inChar, int tableMask, StringBuilder dest) { |
| // Binary search the spoof data key table for the inChar |
| int low = 0; |
| int mid = 0; |
| int limit = fSpoofData.fRawData.fCFUKeysSize; |
| int midc; |
| boolean foundChar = false; |
| // [low, limit), i.e low is inclusive, limit is exclusive |
| do { |
| int delta = (limit - low) / 2; |
| mid = low + delta; |
| midc = fSpoofData.fCFUKeys[mid] & 0x1fffff; |
| if (inChar == midc) { |
| foundChar = true; |
| break; |
| } else if (inChar < midc) { |
| limit = mid; // limit is exclusive |
| } else { |
| // we have checked mid is not the char we looking for, the next |
| // char |
| // we want to check is (mid + 1) |
| low = mid + 1; // low is inclusive |
| } |
| } while (low < limit); |
| if (!foundChar) { // Char not found. It maps to itself. |
| dest.appendCodePoint(inChar); |
| return; |
| } |
| |
| boolean foundKey = false; |
| int keyFlags = fSpoofData.fCFUKeys[mid] & 0xff000000; |
| if ((keyFlags & tableMask) == 0) { |
| // We found the right key char, but the entry doesn't pertain to the |
| // table we need. See if there is an adjacent key that does |
| if (0 != (keyFlags & SpoofChecker.KEY_MULTIPLE_VALUES)) { |
| int altMid; |
| for (altMid = mid - 1; (fSpoofData.fCFUKeys[altMid] & 0x00ffffff) == inChar; altMid--) { |
| keyFlags = fSpoofData.fCFUKeys[altMid] & 0xff000000; |
| if (0 != (keyFlags & tableMask)) { |
| mid = altMid; |
| foundKey = true; |
| break; |
| } |
| } |
| if (!foundKey) { |
| for (altMid = mid + 1; (fSpoofData.fCFUKeys[altMid] & 0x00ffffff) == inChar; altMid++) { |
| keyFlags = fSpoofData.fCFUKeys[altMid] & 0xff000000; |
| if (0 != (keyFlags & tableMask)) { |
| mid = altMid; |
| foundKey = true; |
| break; |
| } |
| } |
| } |
| } |
| if (!foundKey) { |
| // No key entry for this char & table. |
| // The input char maps to itself. |
| dest.appendCodePoint(inChar); |
| return; |
| } |
| } |
| |
| int stringLen = getKeyLength(keyFlags) + 1; |
| int keyTableIndex = mid; |
| |
| // Value is either a char (for strings of length 1) or |
| // an index into the string table (for longer strings) |
| short value = fSpoofData.fCFUValues[keyTableIndex]; |
| if (stringLen == 1) { |
| dest.append((char) value); |
| return; |
| } |
| |
| // String length of 4 from the above lookup is used for all strings of |
| // length >= 4. |
| // For these, get the real length from the string lengths table, |
| // which maps string table indexes to lengths. |
| // All strings of the same length are stored contiguously in the string |
| // table. |
| // 'value' from the lookup above is the starting index for the desired |
| // string. |
| |
| int ix; |
| if (stringLen == 4) { |
| int stringLengthsLimit = fSpoofData.fRawData.fCFUStringLengthsSize; |
| for (ix = 0; ix < stringLengthsLimit; ix++) { |
| if (fSpoofData.fCFUStringLengths[ix].fLastString >= value) { |
| stringLen = fSpoofData.fCFUStringLengths[ix].fStrLength; |
| break; |
| } |
| } |
| assert (ix < stringLengthsLimit); |
| } |
| |
| assert (value + stringLen < fSpoofData.fRawData.fCFUStringTableLen); |
| dest.append(fSpoofData.fCFUStrings, value, stringLen); |
| return; |
| } |
| |
| // WholeScript and MixedScript check implementation. |
| // Implementation for Whole Script tests. |
| // Return the test bit flag to be ORed into the eventual user return value |
| // if a Spoof opportunity is detected. |
| // Input text is already normalized to NFD |
| // Return the set of scripts, each of which can represent something that is |
| // confusable with the input text. The script of the input text |
| // is included; input consisting of characters from a single script will |
| // always produce a result consisting of a set containing that script. |
| void wholeScriptCheck(CharSequence text, ScriptSet result) { |
| int inputIdx = 0; |
| int c; |
| |
| Trie2 table = (0 != (fChecks & ANY_CASE)) ? fSpoofData.fAnyCaseTrie : fSpoofData.fLowerCaseTrie; |
| result.setAll(); |
| while (inputIdx < text.length()) { |
| c = Character.codePointAt(text, inputIdx); |
| inputIdx = Character.offsetByCodePoints(text, inputIdx, 1); |
| int index = table.get(c); |
| if (index == 0) { |
| // No confusables in another script for this char. |
| // TODO: we should change the data to have sets with just the single script |
| // bit for the script of this char. Gets rid of this special case. |
| // Until then, grab the script from the char and intersect it with the set. |
| int cpScript = UScript.getScript(c); |
| assert (cpScript > UScript.INHERITED); |
| result.intersect(cpScript); |
| } else if (index == 1) { |
| // Script == Common or Inherited. Nothing to do. |
| } else { |
| result.intersect(fSpoofData.fScriptSets[index]); |
| } |
| } |
| } |
| |
| /** |
| * Scan a string to determine how many scripts it includes. Ignore characters with script=Common and |
| * scirpt=Inherited. |
| * |
| * @param text |
| * The char text to be scanned |
| * @param checkResult |
| * Optional caller provided fill-in parameter. If not null, on return it will be filled. set to the first |
| * input postion at which a second script was encountered, ignoring Common and Inherited. |
| * @return the number of (non-common,inherited) scripts encountered, clipped to a max of two. |
| * @internal |
| */ |
| int scriptScan(CharSequence text, CheckResult checkResult) { |
| int inputIdx = 0; |
| int c; |
| int scriptCount = 0; |
| int lastScript = UScript.INVALID_CODE; |
| int sc = UScript.INVALID_CODE; |
| while ((inputIdx < text.length()) && scriptCount < 2) { |
| c = Character.codePointAt(text, inputIdx); |
| inputIdx = Character.offsetByCodePoints(text, inputIdx, 1); |
| sc = UScript.getScript(c); |
| if (sc == UScript.COMMON || sc == UScript.INHERITED || sc == UScript.UNKNOWN) { |
| continue; |
| } |
| |
| // Temporary fix: fold Japanese and Korean into Han. |
| // Names are allowed to mix these scripts. |
| // A more general solution will follow later for characters that are |
| // used with multiple scripts. |
| if (sc == UScript.KATAKANA || sc == UScript.HIRAGANA || sc == UScript.HANGUL) { |
| sc = UScript.HAN; |
| } |
| if (sc != lastScript) { |
| scriptCount++; |
| lastScript = sc; |
| } |
| } |
| if (scriptCount == 2 && checkResult != null) { |
| checkResult.position = inputIdx; |
| } |
| return scriptCount; |
| } |
| |
// Data Members

// Internal sanity check.
private int fMagic;
// Bit vector of checks to perform. Tested against the check flag constants
// (SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, WHOLE_SCRIPT_CONFUSABLE, ANY_CASE, ...).
private int fChecks;
// The confusable key/value/string tables and the whole-script tries and
// script sets consulted by confusableLookup() and wholeScriptCheck().
private SpoofData fSpoofData;
// The Set of allowed locales.
private Set<ULocale> fAllowedLocales;
// The UnicodeSet of allowed characters for this Spoof Checker. Defaults to all chars.
private UnicodeSet fAllowedCharsSet;

| // |
| // Confusable Mappings Data Structures |
| // |
| // For the confusable data, we are essentially implementing a map, |
| // key: a code point |
| // value: a string. Most commonly one char in length, but can be more. |
| // |
| // The keys are stored as a sorted array of 32 bit ints. |
| // bits 0-23 a code point value |
| // bits 24-31 flags |
| // 24: 1 if entry applies to SL table |
| // 25: 1 if entry applies to SA table |
| // 26: 1 if entry applies to ML table |
| // 27: 1 if entry applies to MA table |
| // 28: 1 if there are multiple entries for this code point. |
| // 29-30: length of value string, in UChars. |
| // values are (1, 2, 3, other) |
| // The key table is sorted in ascending code point order. (not on the |
| // 32 bit int value, the flag bits do not participate in the sorting.) |
| // |
| // Lookup is done by means of a binary search in the key table. |
| // |
| // The corresponding values are kept in a parallel array of 16 bit ints. |
| // If the value string is of length 1, it is literally in the value array. |
| // For longer strings, the value array contains an index into the strings |
| // table. |
| // |
| // String Table: |
// The strings table contains all of the value strings (those of length two
// or greater)
// concatenated together into one long char (UTF-16) array.
| // |
| // The array is arranged by length of the strings - all strings of the same |
| // length |
| // are stored together. The sections are ordered by length of the strings - |
| // all two char strings first, followed by all of the three Char strings, |
| // etc. |
| // |
| // There is no nul character or other mark between adjacent strings. |
| // |
| // String Lengths table |
| // The length of strings from 1 to 3 is flagged in the key table. |
| // For strings of length 4 or longer, the string length table provides a |
| // mapping between an index into the string table and the corresponding |
| // length. |
| // Strings of these lengths are rare, so lookup time is not an issue. |
| // Each entry consists of |
| // short index of the _last_ string with this length |
| // short the length |
| |
// Flag bits in the Key entries. The low 24 bits of a key hold the code
// point; these constants describe the bits above it.

// Bit 24: the entry applies to the SL (single script, lower case) table.
static final int SL_TABLE_FLAG = (1 << 24);
// Bit 25: the entry applies to the SA (single script, any case) table.
static final int SA_TABLE_FLAG = (1 << 25);
// Bit 26: the entry applies to the ML (mixed script, lower case) table.
static final int ML_TABLE_FLAG = (1 << 26);
// Bit 27: the entry applies to the MA (mixed script, any case) table.
static final int MA_TABLE_FLAG = (1 << 27);
// Bit 28: there are multiple key entries for this code point.
static final int KEY_MULTIPLE_VALUES = (1 << 28);
// Position of the two-bit (bits 29-30) value string length field.
static final int KEY_LENGTH_SHIFT = 29;
| |
| static final int getKeyLength(int x) { |
| return (((x) >> 29) & 3); |
| } |
| |
| // --------------------------------------------------------------------------------------- |
| // |
| // Raw Binary Data Formats, as loaded from the ICU data file, |
| // or as built by the builder. |
| // |
| // --------------------------------------------------------------------------------------- |
| private static class SpoofDataHeader { |
| int fMagic; // (0x8345fdef) |
| byte[] fFormatVersion = new byte[4]; // Data Format. Same as the value in |
| // class UDataInfo |
| // if there is one associated with this data. |
| int fLength; // Total lenght in bytes of this spoof data, |
| // including all sections, not just the header. |
| |
| // The following four sections refer to data representing the confusable |
| // data |
| // from the Unicode.org data from "confusables.txt" |
| |
| int fCFUKeys; // byte offset to Keys table (from SpoofDataHeader *) |
| int fCFUKeysSize; // number of entries in keys table (32 bits each) |
| |
| // TODO: change name to fCFUValues, for consistency. |
| int fCFUStringIndex; // byte offset to String Indexes table |
| int fCFUStringIndexSize; // number of entries in String Indexes table (16 bits each) |
| // (number of entries must be same as in Keys table |
| |
| int fCFUStringTable; // byte offset of String table |
| int fCFUStringTableLen; // length of string table (in 16 bit UChars) |
| |
| int fCFUStringLengths; // byte offset to String Lengths table |
| int fCFUStringLengthsSize; // number of entries in lengths table. (2 x 16 bits each) |
| |
| // The following sections are for data from confusablesWholeScript.txt |
| int fAnyCaseTrie; // byte offset to the serialized Any Case Trie |
| int fAnyCaseTrieLength; // Length (bytes) of the serialized Any Case Trie |
| |
| int fLowerCaseTrie; // byte offset to the serialized Lower Case Trie |
| int fLowerCaseTrieLength; // Length (bytes) of the serialized Lower Case Trie |
| |
| int fScriptSets; // byte offset to array of ScriptSets |
| int fScriptSetsLength; // Number of ScriptSets (24 bytes each) |
| |
| // The following sections are for data from xidmodifications.txt |
| int[] unused = new int[15]; // Padding, Room for Expansion |
| |
| public SpoofDataHeader() { |
| } |
| |
| public SpoofDataHeader(DataInputStream dis) throws IOException { |
| int i; |
| fMagic = dis.readInt(); |
| for (i = 0; i < fFormatVersion.length; i++) { |
| fFormatVersion[i] = dis.readByte(); |
| } |
| fLength = dis.readInt(); |
| fCFUKeys = dis.readInt(); |
| fCFUKeysSize = dis.readInt(); |
| fCFUStringIndex = dis.readInt(); |
| fCFUStringIndexSize = dis.readInt(); |
| fCFUStringTable = dis.readInt(); |
| fCFUStringTableLen = dis.readInt(); |
| fCFUStringLengths = dis.readInt(); |
| fCFUStringLengthsSize = dis.readInt(); |
| fAnyCaseTrie = dis.readInt(); |
| fAnyCaseTrieLength = dis.readInt(); |
| fLowerCaseTrie = dis.readInt(); |
| fLowerCaseTrieLength = dis.readInt(); |
| fScriptSets = dis.readInt(); |
| fScriptSetsLength = dis.readInt(); |
| for (i = 0; i < unused.length; i++) { |
| unused[i] = dis.readInt(); |
| } |
| } |
| |
| public void output(DataOutputStream os) throws java.io.IOException { |
| int i; |
| os.writeInt(fMagic); |
| for (i = 0; i < fFormatVersion.length; i++) { |
| os.writeByte(fFormatVersion[i]); |
| } |
| os.writeInt(fLength); |
| os.writeInt(fCFUKeys); |
| os.writeInt(fCFUKeysSize); |
| os.writeInt(fCFUStringIndex); |
| os.writeInt(fCFUStringIndexSize); |
| os.writeInt(fCFUStringTable); |
| os.writeInt(fCFUStringTableLen); |
| os.writeInt(fCFUStringLengths); |
| os.writeInt(fCFUStringLengthsSize); |
| os.writeInt(fAnyCaseTrie); |
| os.writeInt(fAnyCaseTrieLength); |
| os.writeInt(fLowerCaseTrie); |
| os.writeInt(fLowerCaseTrieLength); |
| os.writeInt(fScriptSets); |
| os.writeInt(fScriptSetsLength); |
| for (i = 0; i < unused.length; i++) { |
| os.writeInt(unused[i]); |
| } |
| } |
| } |
| |
| // ------------------------------------------------------------------------------------- |
| // SpoofData |
| // |
| // A small class that wraps the raw (was memory mapped in the C world) spoof data. |
| // Nothing in this class includes state that is specific to any particular |
| // SpoofDetector object. |
| // --------------------------------------------------------------------------------------- |
| private static class SpoofData { |
| // getDefault() - return a wrapper around the spoof data that is |
| // baked into the default ICU data. |
| // Load standard ICU spoof data. |
| public static SpoofData getDefault() throws java.io.IOException { |
| // TODO: Cache it. Lazy create, keep until cleanup. |
| InputStream is = com.ibm.icu.impl.ICUData.getRequiredStream(com.ibm.icu.impl.ICUResourceBundle.ICU_BUNDLE |
| + "/confusables.cfu"); |
| SpoofData This = new SpoofData(is); |
| return This; |
| } |
| |
| // SpoofChecker Data constructor for use from data builder. |
| // Initializes a new, empty data area that will be populated later. |
| public SpoofData() { |
| // The spoof header should already be sized to be a multiple of 16 |
| // bytes. |
| // Just in case it's not, round it up. |
| |
| fRawData = new SpoofDataHeader(); |
| |
| fRawData.fMagic = SpoofChecker.MAGIC; |
| fRawData.fFormatVersion[0] = 1; |
| fRawData.fFormatVersion[1] = 0; |
| fRawData.fFormatVersion[2] = 0; |
| fRawData.fFormatVersion[3] = 0; |
| } |
| |
| // Constructor for use when creating from prebuilt default data. |
| // A InputStream is what the ICU internal data loading functions provide. |
| public SpoofData(InputStream is) throws java.io.IOException { |
| // Seek past the ICU data header. |
| // TODO: verify that the header looks good. |
| DataInputStream dis = new DataInputStream(new BufferedInputStream(is)); |
| dis.skip(0x80); |
| assert (dis.markSupported()); |
| dis.mark(Integer.MAX_VALUE); |
| |
| fRawData = new SpoofDataHeader(dis); |
| initPtrs(dis); |
| } |
| |
| // Check raw SpoofChecker Data Version compatibility. |
| // Return true it looks good. |
| static boolean validateDataVersion(SpoofDataHeader rawData) { |
| if (rawData == null || rawData.fMagic != SpoofChecker.MAGIC || rawData.fFormatVersion[0] > 1 |
| || rawData.fFormatVersion[1] > 0) { |
| return false; |
| } |
| return true; |
| } |
| |
| // build SpoofChecker from DataInputStream |
| // read from binay data input stream |
| // initialize the pointers from this object to the raw data. |
| // Initialize the pointers to the various sections of the raw data. |
| // |
| // This function is used both during the Trie building process (multiple |
| // times, as the individual data sections are added), and |
| // during the opening of a SpoofChecker Checker from prebuilt data. |
| // |
| // The pointers for non-existent data sections (identified by an offset of |
| // 0) are set to null. |
| void initPtrs(DataInputStream dis) throws java.io.IOException { |
| int i; |
| fCFUKeys = null; |
| fCFUValues = null; |
| fCFUStringLengths = null; |
| fCFUStrings = null; |
| |
| // the binary file from C world is memory-mapped, each section of data |
| // is align-ed to 16-bytes boundary, to make the code more robust we call |
| // reset()/skip() which essensially seek() to the correct offset. |
| dis.reset(); |
| dis.skip(fRawData.fCFUKeys); |
| if (fRawData.fCFUKeys != 0) { |
| fCFUKeys = new int[fRawData.fCFUKeysSize]; |
| for (i = 0; i < fRawData.fCFUKeysSize; i++) { |
| fCFUKeys[i] = dis.readInt(); |
| } |
| } |
| |
| dis.reset(); |
| dis.skip(fRawData.fCFUStringIndex); |
| if (fRawData.fCFUStringIndex != 0) { |
| fCFUValues = new short[fRawData.fCFUStringIndexSize]; |
| for (i = 0; i < fRawData.fCFUStringIndexSize; i++) { |
| fCFUValues[i] = dis.readShort(); |
| } |
| } |
| |
| dis.reset(); |
| dis.skip(fRawData.fCFUStringTable); |
| if (fRawData.fCFUStringTable != 0) { |
| fCFUStrings = new char[fRawData.fCFUStringTableLen]; |
| for (i = 0; i < fRawData.fCFUStringTableLen; i++) { |
| fCFUStrings[i] = dis.readChar(); |
| } |
| } |
| |
| dis.reset(); |
| dis.skip(fRawData.fCFUStringLengths); |
| if (fRawData.fCFUStringLengths != 0) { |
| fCFUStringLengths = new SpoofStringLengthsElement[fRawData.fCFUStringLengthsSize]; |
| for (i = 0; i < fRawData.fCFUStringLengthsSize; i++) { |
| fCFUStringLengths[i] = new SpoofStringLengthsElement(); |
| fCFUStringLengths[i].fLastString = dis.readShort(); |
| fCFUStringLengths[i].fStrLength = dis.readShort(); |
| } |
| } |
| |
| dis.reset(); |
| dis.skip(fRawData.fAnyCaseTrie); |
| if (fAnyCaseTrie == null && fRawData.fAnyCaseTrie != 0) { |
| fAnyCaseTrie = Trie2.createFromSerialized(dis); |
| } |
| dis.reset(); |
| dis.skip(fRawData.fLowerCaseTrie); |
| if (fLowerCaseTrie == null && fRawData.fLowerCaseTrie != 0) { |
| fLowerCaseTrie = Trie2.createFromSerialized(dis); |
| } |
| |
| dis.reset(); |
| dis.skip(fRawData.fScriptSets); |
| if (fRawData.fScriptSets != 0) { |
| fScriptSets = new ScriptSet[fRawData.fScriptSetsLength]; |
| for (i = 0; i < fRawData.fScriptSetsLength; i++) { |
| fScriptSets[i] = new ScriptSet(dis); |
| } |
| } |
| } |
| |
| SpoofDataHeader fRawData; |
| |
| // Confusable data |
| int[] fCFUKeys; |
| short[] fCFUValues; |
| SpoofStringLengthsElement[] fCFUStringLengths; |
| char[] fCFUStrings; |
| |
| // Whole Script Confusable Data |
| Trie2 fAnyCaseTrie; |
| Trie2 fLowerCaseTrie; |
| ScriptSet[] fScriptSets; |
| |
| private static class SpoofStringLengthsElement { |
| short fLastString; // index in string table of last string with this length |
| short fStrLength; // Length of strings |
| } |
| |
| } |
| |
| // ------------------------------------------------------------------------------- |
| // |
| // ScriptSet - Script code bit sets. Used with the whole script confusable data. |
| // Used both at data build and at run time. |
| // Could almost be a Java BitSet, except that the input and output would |
| // be awkward. |
| // |
| // ------------------------------------------------------------------------------- |
| private static class ScriptSet { |
| public ScriptSet() { |
| } |
| |
| public ScriptSet(DataInputStream dis) throws java.io.IOException { |
| for (int j = 0; j < bits.length; j++) { |
| bits[j] = dis.readInt(); |
| } |
| } |
| |
| public void output(DataOutputStream os) throws java.io.IOException { |
| for (int i = 0; i < bits.length; i++) { |
| os.writeInt(bits[i]); |
| } |
| } |
| |
| public boolean equals(ScriptSet other) { |
| for (int i = 0; i < bits.length; i++) { |
| if (bits[i] != other.bits[i]) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| public void Union(int script) { |
| int index = script / 32; |
| int bit = 1 << (script & 31); |
| assert (index < bits.length * 4 * 4); |
| bits[index] |= bit; |
| } |
| |
| @SuppressWarnings("unused") |
| public void Union(ScriptSet other) { |
| for (int i = 0; i < bits.length; i++) { |
| bits[i] |= other.bits[i]; |
| } |
| } |
| |
| public void intersect(ScriptSet other) { |
| for (int i = 0; i < bits.length; i++) { |
| bits[i] &= other.bits[i]; |
| } |
| } |
| |
| public void intersect(int script) { |
| int index = script / 32; |
| int bit = 1 << (script & 31); |
| assert (index < bits.length * 4 * 4); |
| int i; |
| for (i = 0; i < index; i++) { |
| bits[i] = 0; |
| } |
| bits[index] &= bit; |
| for (i = index + 1; i < bits.length; i++) { |
| bits[i] = 0; |
| } |
| } |
| |
| public void setAll() { |
| for (int i = 0; i < bits.length; i++) { |
| bits[i] = 0xffffffff; |
| } |
| } |
| |
| @SuppressWarnings("unused") |
| public void resetAll() { |
| for (int i = 0; i < bits.length; i++) { |
| bits[i] = 0; |
| } |
| } |
| |
| public int countMembers() { |
| // This bit counter is good for sparse numbers of '1's, which is |
| // very much the case that we will usually have. |
| int count = 0; |
| for (int i = 0; i < bits.length; i++) { |
| int x = bits[i]; |
| while (x > 0) { |
| count++; |
| x &= (x - 1); // and off the least significant one bit. |
| } |
| } |
| return count; |
| } |
| |
| private int[] bits = new int[6]; |
| } |
| } |