#!/usr/bin/python -B
# -*- coding: utf-8 -*-
# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2009-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  preparseucd.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2011nov03 (forked from ucdcopy.py)
#   created by: Markus W. Scherer
#
# Copies Unicode Character Database (UCD) files from a tree
# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/
# to ICU's source/data/unidata/ and source/test/testdata/
# and modifies some of the files to make them more compact.
# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
#
# Invoke with two command-line parameters:
# 1. source folder with UCD & idna files
# 2. ICU source root folder (ICU 59+ combined trunk with icu4c, icu4j, tools)
#
# Sample invocation:
#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src

import array
import bisect
import codecs
import os
import os.path
import re
import shutil
import sys

# Unicode version ---------------------------------------------------------- ***

_ucd_version = "?"

# ISO 15924 script codes --------------------------------------------------- ***

# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
# that are not yet in the UCD.
_scripts_only_in_iso15924 = (
    "Afak", "Blis", "Cirt", "Cyrs",
    "Egyd", "Egyh", "Geok",
    "Hanb", "Hans", "Hant",
    "Inds", "Jamo", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
    "Maya", "Moon", "Nkgb", "Phlv", "Roro",
    "Sara", "Syre", "Syrj", "Syrn",
    "Teng", "Visp", "Wole", "Zmth", "Zsye", "Zsym", "Zxxx"
)
# Properties --------------------------------------------------------------- ***

# Properties that we do not want to store in ppucd.txt.
# Not a frozenset so that we can add aliases for simpler subsequent testing.
_ignored_properties = set((
    # Other_Xyz only contribute to Xyz, store only the latter.
    "OAlpha",
    "ODI",
    "OGr_Ext",
    "OIDC",
    "OIDS",
    "OLower",
    "OMath",
    "OUpper",
    # Further properties that just contribute to others.
    "CE",  # Composition_Exclusion just contributes to Full_Composition_Exclusion.
    "JSN",
    # These properties just don't seem useful.
    # They are deprecated since Unicode 6.0.
    "XO_NFC",
    "XO_NFD",
    "XO_NFKC",
    "XO_NFKD",
    # ICU does not use Unihan properties.
    "cjkAccountingNumeric",
    "cjkOtherNumeric",
    "cjkPrimaryNumeric",
    "cjkCompatibilityVariant",
    "cjkIICore",
    "cjkIRG_GSource",
    "cjkIRG_HSource",
    "cjkIRG_JSource",
    "cjkIRG_KPSource",
    "cjkIRG_KSource",
    "cjkIRG_MSource",
    "cjkIRG_TSource",
    "cjkIRG_USource",
    "cjkIRG_VSource",
    "cjkRSUnicode"
))

# These properties (short names) map code points to
# strings or other unusual values (property types String or Miscellaneous)
# that cannot be block-compressed (or would be confusing).
_uncompressible_props = frozenset((
    "bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC",
    "isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF",
    # scx is block-compressible.
    "scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc"
))

# Dictionary of properties.
# Keyed by normalized property names and aliases.
# Each value is a tuple with
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
#    The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Set of short property value names.
# 3: Dictionary of property values.
#    For Catalog & Enumerated properties,
#    maps each value name to a list of aliases.
#    Empty for other types of properties.
_properties = {}

# Dictionary of binary-property values which we store as False/True.
# Same as the values dictionary of one of the binary properties.
_binary_values = {}

# Dictionary of null values.
# Keyed by short property names.
# These are type-specific values for properties that occur in the data.
# They are overridden by _defaults, block and code point properties.
_null_values = {}

# Property value names for null values.
# We do not store these in _defaults.
_null_names = frozenset(("<none>", "NaN"))

# Dictionary of explicit default property values.
# Keyed by short property names.
_defaults = {"gc": "Cn"}

# _null_values overridden by explicit _defaults.
# Initialized after parsing is done.
_null_or_defaults = {}

# List of properties with an ICU UProperty enum.
# Each item is an (enum, pname, values) tuple.
# - enum: the ICU enum UProperty constant string
# - pname: the UCD short property name
# - values: list of (enum, vname) pairs per property value
#   - enum: the ICU property value's enum constant string
#   - vname: the UCD short property value name
_icu_properties = []

# Dictionary of short property names mapped to _icu_properties items.
_pname_to_icu_prop = {}

_non_alnum_re = re.compile("[^a-zA-Z0-9]")

def NormPropName(pname):
    """Returns a normalized form of pname.
    Removes non-ASCII-alphanumeric characters and lowercases letters."""
    return _non_alnum_re.sub("", pname).lower()
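# For example (illustrative): NormPropName("General_Category") == "generalcategory"
# and NormPropName("Script_Extensions") == "scriptextensions".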


def GetProperty(pname):
    """Returns the _properties value for the pname.
    Returns None if the property is ignored.
    Caches alternate spellings of the property name."""
    # Try the input name.
    prop = _properties.get(pname)
    if prop != None: return prop
    if pname in _ignored_properties: return None
    # Try the normalized input name.
    norm_name = NormPropName(pname)
    prop = _properties.get(norm_name)
    if prop != None:
        _properties[pname] = prop  # Cache prop under this new name spelling.
        return prop
    elif norm_name in _ignored_properties:
        _ignored_properties.add(pname)  # Remember to ignore this new name spelling.
        return None
    else:
        raise NameError("unknown property %s\n" % pname)


def GetShortPropertyName(pname):
    if pname in _null_values: return pname  # pname is already the short name.
    prop = GetProperty(pname)
    if not prop: return ""  # For ignored properties.
    return prop[1][0] or prop[1][1]  # Long name if no short name.


def GetShortPropertyValueName(prop, vname):
    if vname in prop[2]: return vname
    values = prop[3]
    aliases = values.get(vname)
    if aliases == None:
        norm_name = NormPropName(vname)
        aliases = values.get(norm_name)
        if aliases == None:
            raise NameError("unknown value name %s for property %s\n" %
                            (vname, prop[1][0]))
        values[vname] = aliases
    return aliases[0] or aliases[1]  # Long name if no short name.


def NormalizePropertyValue(prop, vname):
    if prop[2]:  # Binary/Catalog/Enumerated property.
        value = GetShortPropertyValueName(prop, vname)
        if prop[0] == "Binary":
            value = value == "Y"
        if prop[1][0].endswith("ccc"):
            value = int(value)
    else:
        value = vname
    return value
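# For example: for a Binary property, vname "Yes" normalizes to True;
# for ccc/lccc/tccc, "230" becomes the int 230; for String/Miscellaneous
# properties (empty prop[2]), vname is returned unchanged.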

# Character data ----------------------------------------------------------- ***

# Lists of NamesList h1 and h2 headings.
# Each h1 value is a (start, end, comment) tuple.
# Each h2 value is a (cp, comment) tuple.
_h1 = []
_h2 = []

# List of Unicode blocks.
# Each item is a tuple of start & end code point integers
# and a dictionary of default property values.
_blocks = []

# List of ranges with algorithmic names.
# Each value is a list of [start, end, type, prefix]
# where prefix is optional.
_alg_names_ranges = []

# List of Unicode character ranges and their properties,
# stored as an inversion map with range_start & props dictionary.
# Starts with one range for all of Unicode without any properties.
# Setting values subdivides ranges.
_starts = array.array('l', [0, 0x110000])  # array of int32_t
_props = [{}, {}]  # props for 0 and 110000

def FindRange(x):
    """Binary search for x in the inversion map.
    Returns the index i such that _starts[i] <= x < _starts[i + 1]."""
    return bisect.bisect(_starts, x) - 1
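# For example, with the initial map above, FindRange(0x41) == 0 because
# _starts[0] == 0 <= 0x41 < _starts[1] == 0x110000.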


def GetProps(c):
    i = FindRange(c)
    return _props[i]


def UpdateProps(start, end, update):
    assert 0 <= start <= end <= 0x10ffff
    (need_to_update, do_update, u) = (update[0], update[1], update[2])
    # Find the index i of the range in _starts that contains start.
    i = FindRange(start)
    limit = end + 1
    # Intersect [start, limit[ with ranges in _starts.
    c_start = _starts[i]
    c_limit = _starts[i + 1]
    c_props = _props[i]
    # c_start <= start < c_limit
    if c_start < start:
        update_limit = c_limit if c_limit <= limit else limit
        if need_to_update(u, start, update_limit - 1, c_props):
            # Split off [c_start, start[ with a copy of c_props.
            i += 1
            c_props = c_props.copy()
            _starts.insert(i, start)
            _props.insert(i, c_props)
            c_start = start
    # Modify all ranges that are fully inside [start, limit[.
    while c_limit <= limit:
        # start <= c_start < c_limit <= limit
        if need_to_update(u, c_start, c_limit - 1, c_props):
            do_update(u, c_start, c_limit - 1, c_props)
        if c_limit == 0x110000: return
        i += 1
        c_start = c_limit
        c_limit = _starts[i + 1]
        c_props = _props[i]
    if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
        # Split off [limit, c_limit[ with a copy of c_props.
        _starts.insert(i + 1, limit)
        _props.insert(i + 1, c_props.copy())
        # Modify [c_start, limit[ c_props.
        do_update(u, c_start, limit - 1, c_props)
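# Illustrative walk-through, assuming the initial two-range map:
# SetProps(0x41, 0x5A, {"gc": "Lu"}) intersects [0x41, 0x5B[ with [0, 0x110000[,
# splitting off [0, 0x41[ and [0x5B, 0x110000[ unchanged, so that afterwards
# _starts == array('l', [0, 0x41, 0x5B, 0x110000]) and only the middle range
# carries {"gc": "Lu"}.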


def NeedToSetProps(props, start, end, c_props):
    """Returns True if props is not a sub-dict of c_props."""
    for (pname, value) in props.iteritems():
        if pname not in c_props or value != c_props[pname]: return True
    return False


def DoSetProps(props, start, end, c_props):
    c_props.update(props)


def SetProps(start, end, props):
    UpdateProps(start, end, (NeedToSetProps, DoSetProps, props))


def NeedToSetAlways(nv, start, end, c_props):
    return True


# For restoring boundaries after merging adjacent same-props ranges.
def AddBoundary(x):
    """Ensure that there is a range start/limit at x."""
    assert 0 <= x <= 0x10ffff
    i = FindRange(x)
    if _starts[i] == x: return
    # Split the range at x.
    c_start = _starts[i]
    c_limit = _starts[i + 1]
    c_props = _props[i]
    # c_start < x < c_limit
    i += 1
    _starts.insert(i, x)
    _props.insert(i, c_props.copy())
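# For example, AddBoundary(0x100) on a single range [0, 0x110000[ splits it
# into [0, 0x100[ and [0x100, 0x110000[, both with (copies of) the same props.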


def SetDefaultValue(pname, value):
    """Sets the property's default value. Ignores null values."""
    prop = GetProperty(pname)
    if prop and value not in _null_names:
        value = NormalizePropertyValue(prop, value)
        if value != _null_values[prop[1][0]]:
            _defaults[prop[1][0]] = value
            SetProps(0, 0x10ffff, {prop[1][0]: value})


def SetBinaryPropertyToTrue(pname, start, end):
    prop = GetProperty(pname)
    if prop:
        assert prop[0] == "Binary"
        SetProps(start, end, {prop[1][0]: True})


def SetPropValue(prop, vname, start, end):
    value = NormalizePropertyValue(prop, vname)
    SetProps(start, end, {prop[1][0]: value})


def SetPropertyValue(pname, vname, start, end):
    prop = GetProperty(pname)
    if prop: SetPropValue(prop, vname, start, end)

# Parsing ------------------------------------------------------------------ ***

_stripped_cp_re = re.compile("([0-9a-fA-F]+)$")
_stripped_range_re = re.compile("([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
_missing_re = re.compile("# *@missing: *0000\.\.10FFFF *; *(.+)$")

def ReadUCDLines(in_file, want_ranges=True, want_other=False,
                 want_comments=False, want_missing=False):
    """Parses lines from a semicolon-delimited UCD text file.
    Strips comments, ignores empty and all-comment lines.
    Yields tuples (type, line, ...).
    """
    for line in in_file:
        line = line.strip()
        if not line: continue
        if line.startswith("#"):  # whole-line comment
            if want_missing:
                match = _missing_re.match(line)
                if match:
                    fields = match.group(1).split(";")
                    for i in xrange(len(fields)): fields[i] = fields[i].strip()
                    yield ("missing", line, fields)
                    continue
            if want_comments: yield ("comment", line)
            continue
        comment_start = line.find("#")  # inline comment
        if comment_start >= 0:
            line = line[:comment_start].rstrip()
            if not line: continue
        fields = line.split(";")
        for i in xrange(len(fields)): fields[i] = fields[i].strip()
        if want_ranges:
            first = fields[0]
            match = _stripped_range_re.match(first)
            if match:
                start = int(match.group(1), 16)
                end = int(match.group(2), 16)
                yield ("range", line, start, end, fields)
                continue
            match = _stripped_cp_re.match(first)
            if match:
                c = int(match.group(1), 16)
                yield ("range", line, c, c, fields)
                continue
        if want_other:
            yield ("other", line, fields)
        else:
            raise SyntaxError("unable to parse line\n  %s\n" % line)

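# Illustrative yields (input lines are examples, not verbatim UCD data):
#   "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;"
#                                   -> ("range", line, 0x41, 0x41, fields)
#   "0030..0039; Nd"                -> ("range", line, 0x30, 0x39, fields)
#   "# @missing: 0000..10FFFF; NaN" -> ("missing", line, ["NaN"])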


def AddBinaryProperty(short_name, long_name):
    _null_values[short_name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3])
    _properties[short_name] = prop
    _properties[long_name] = prop
    _properties[NormPropName(short_name)] = prop
    _properties[NormPropName(long_name)] = prop


def AddPOSIXBinaryProperty(name):
    # We only define a long name for ICU-specific (non-UCD) POSIX properties.
    _null_values[name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", ["", name], bin_prop[2], bin_prop[3])
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # This is to match UProperty UCHAR_POSIX_ALNUM etc.
    _properties["posix" + NormPropName(name)] = prop


# Match a comment line like
# PropertyAliases-6.1.0.txt
# and extract the Unicode version.
_ucd_version_re = re.compile("# *PropertyAliases" +
                             "-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" +
                             "\\.txt")
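# For example, this matches "# PropertyAliases-6.1.0.txt" with group(1) == "6.1.0",
# and also draft file names like "# PropertyAliases-10.0.0d5.txt" (group(1) == "10.0.0").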

def ParsePropertyAliases(in_file):
    global _ucd_version
    prop_type_nulls = {
        "Binary": False,
        "Catalog": "??",  # Must be specified, e.g., in @missing line.
        "Enumerated": "??",  # Must be specified.
        "Numeric": "NaN",
        "String": "",
        "Miscellaneous": ""
    }
    for data in ReadUCDLines(in_file, want_ranges=False,
                             want_other=True, want_comments=True):
        if data[0] == "comment":
            line = data[1]
            match = _ucd_version_re.match(line)
            if match:
                _ucd_version = match.group(1)
            else:
                words = line[1:].lstrip().split()
                if len(words) == 2 and words[1] == "Properties":
                    prop_type = words[0]
                    null_value = prop_type_nulls[prop_type]
        else:
            # type == "other"
            aliases = data[2]
            name = aliases[0]
            if name in _ignored_properties:
                for alias in aliases:
                    _ignored_properties.add(alias)
                    _ignored_properties.add(NormPropName(alias))
            else:
                if name.endswith("ccc"):
                    _null_values[name] = 0
                else:
                    _null_values[name] = null_value
                prop = (prop_type, aliases, set(), {})
                for alias in aliases:
                    _properties[alias] = prop
                    _properties[NormPropName(alias)] = prop
    # Add provisional and ICU-specific properties we need.
    # We add some in support of runtime API, even if we do not write
    # data for them to ppucd.txt (e.g., lccc & tccc).
    # We add others just to represent UCD data that contributes to
    # some functionality, although Unicode has not "blessed" them
    # as separate properties (e.g., Turkic_Case_Folding).

    # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
    name = "Turkic_Case_Folding"
    _null_values[name] = ""
    prop = ("String", [name, name], set(), {})
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
    name = "Conditional_Case_Mappings"
    _null_values[name] = ""
    prop = ("Miscellaneous", [name, name], set(), {})
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # lccc = ccc of first cp in canonical decomposition.
    _null_values["lccc"] = 0
    ccc_prop = list(_properties["ccc"])
    ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
    prop = tuple(ccc_prop)
    _properties["lccc"] = prop
    _properties["Lead_Canonical_Combining_Class"] = prop
    _properties["leadcanonicalcombiningclass"] = prop
    # tccc = ccc of last cp in canonical decomposition.
    _null_values["tccc"] = 0
    ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
    prop = tuple(ccc_prop)
    _properties["tccc"] = prop
    _properties["Trail_Canonical_Combining_Class"] = prop
    _properties["trailcanonicalcombiningclass"] = prop
    # Script_Extensions
    if "scx" not in _properties:
        _null_values["scx"] = ""
        prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
        _properties["scx"] = prop
        _properties["Script_Extensions"] = prop
        _properties["scriptextensions"] = prop
    # General Category as a bit mask.
    _null_values["gcm"] = "??"
    gc_prop = _properties["gc"]
    prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
    _properties["gcm"] = prop
    _properties["General_Category_Mask"] = prop
    _properties["generalcategorymask"] = prop
    # Various binary properties.
    AddBinaryProperty("Sensitive", "Case_Sensitive")
    AddBinaryProperty("nfdinert", "NFD_Inert")
    AddBinaryProperty("nfkdinert", "NFKD_Inert")
    AddBinaryProperty("nfcinert", "NFC_Inert")
    AddBinaryProperty("nfkcinert", "NFKC_Inert")
    AddBinaryProperty("segstart", "Segment_Starter")
    # http://www.unicode.org/reports/tr51/#Emoji_Properties
    AddBinaryProperty("Emoji", "Emoji")
    AddBinaryProperty("EPres", "Emoji_Presentation")
    AddBinaryProperty("EMod", "Emoji_Modifier")
    AddBinaryProperty("EBase", "Emoji_Modifier_Base")
    AddBinaryProperty("EComp", "Emoji_Component")
    AddBinaryProperty("ExtPict", "Extended_Pictographic")
    # C/POSIX character classes that do not have Unicode property [value] aliases.
    # See uchar.h.
    AddPOSIXBinaryProperty("alnum")
    AddPOSIXBinaryProperty("blank")
    AddPOSIXBinaryProperty("graph")
    AddPOSIXBinaryProperty("print")
    AddPOSIXBinaryProperty("xdigit")


def ParsePropertyValueAliases(in_file):
    global _binary_values
    for data in ReadUCDLines(in_file, want_ranges=False,
                             want_other=True, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(data[2][0], data[2][1])
        else:
            # type == "other"
            fields = data[2]
            pname = fields[0]
            prop = GetProperty(pname)
            if prop:
                del fields[0]  # Only the list of aliases remains.
                short_name = fields[0]
                if short_name == "n/a":  # no short name
                    fields[0] = ""
                    short_name = fields[1]
                prop[2].add(short_name)
                values = prop[3]
                for alias in fields:
                    if alias:
                        values[alias] = fields
                        values[NormPropName(alias)] = fields
                if prop[0] == "Binary" and not _binary_values:
                    _binary_values = values
    # Some of the @missing lines with non-null default property values
    # are in files that we do not parse;
    # either because we do not otherwise need the data for that property
    # (i.e., the @missing line would be the only reason to parse such a file)
    # or because we compute the property at runtime,
    # such as the Hangul_Syllable_Type.
    if "dt" not in _defaults:  # DerivedDecompositionType.txt
        _defaults["dt"] = "None"
    if "nt" not in _defaults:  # DerivedNumericType.txt
        _defaults["nt"] = "None"
    if "hst" not in _defaults:  # HangulSyllableType.txt
        _defaults["hst"] = "NA"
    if "gc" not in _defaults:  # No @missing line in any .txt file?
        _defaults["gc"] = "Cn"
    # Copy the gc default value to gcm.
    _defaults["gcm"] = _defaults["gc"]
    # Add ISO 15924-only script codes.
    # Only for the ICU script code API, not necessary for parsing the UCD.
    script_prop = _properties["sc"]
    short_script_names = script_prop[2]  # set
    script_values = script_prop[3]  # dict
    remove_scripts = []
    for script in _scripts_only_in_iso15924:
        if script in short_script_names:
            remove_scripts.append(script)
        else:
            short_script_names.add(script)
            # Do not invent a Unicode long script name before the UCD adds the script.
            script_list = [script, script]  # [short, long]
            script_values[script] = script_list
            # Probably not necessary because
            # we will not parse these scripts from the UCD:
            script_values[NormPropName(script)] = script_list
    if remove_scripts:
        raise ValueError(
            "remove %s from _scripts_only_in_iso15924" % remove_scripts)


def ParseBlocks(in_file):
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("blk", data[2][0])
        else:
            # type == "range"
            (start, end, name) = (data[2], data[3], data[4][1])
            _blocks.append((start, end, {"blk": name}))
            SetPropertyValue("blk", name, start, end)
    _blocks.sort()
    # Check for overlapping blocks.
    prev_end = -1
    for b in _blocks:
        start = b[0]
        end = b[1]
        if prev_end >= start:
            raise ValueError(
                "block %04lX..%04lX %s overlaps with another ending at %04lX\n" %
                (start, end, b[2]["blk"], prev_end))
        prev_end = end


def ParseUnicodeData(in_file):
    dt_prop = GetProperty("dt")
    range_first_line = ""
    range_first = -1
    for data in ReadUCDLines(in_file, want_missing=True):
        # type == "range"
        (line, c, end, fields) = (data[1], data[2], data[3], data[4])
        assert c == end
        name = fields[1]
        if name.startswith("<"):
            if name.endswith(", First>"):
                if range_first >= 0:
                    raise SyntaxError(
                        "error: unterminated range started at\n  %s\n" %
                        range_first_line)
                range_first = c
                range_first_line = line
                continue
            elif name.endswith(", Last>"):
                if range_first < 0:
                    raise SyntaxError(
                        "error: range end without start at\n  %s\n" %
                        line)
                elif range_first > c:
                    raise SyntaxError(
                        "error: range start/end out of order at\n  %s\n  %s\n" %
                        (range_first_line, line))
                first_name = range_first_line.split(";")[1][1:-8]
                name = name[1:-7]
                if first_name != name:
                    raise SyntaxError(
                        "error: range start/end name mismatch at\n  %s\n  %s\n" %
                        (range_first_line, line))
                end = c
                c = range_first
                range_first = -1
                # Remember algorithmic name ranges.
                if "Ideograph" in name:
                    prefix = "CJK UNIFIED IDEOGRAPH-"
                    if c == 0x17000: prefix = "TANGUT IDEOGRAPH-"
                    _alg_names_ranges.append([c, end, "han", prefix])
                elif name == "Hangul Syllable":
                    _alg_names_ranges.append([c, end, "hangul"])
                name = ""
            else:
                # Ignore non-names like <control>.
                name = ""
        props = {}
        if name: props["na"] = name
        props["gc"] = fields[2]
        ccc = int(fields[3])
        if ccc: props["ccc"] = ccc
        props["bc"] = fields[4]
        # Decomposition type & mapping.
        dm = fields[5]
        if dm:
            if dm.startswith("<"):
                dt_limit = dm.index(">")
                dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
                dm = dm[dt_limit + 1:].lstrip()
            else:
                dt = "Can"
            props["dt"] = dt
            props["dm"] = dm
        # Numeric type & value.
        decimal = fields[6]
        digit = fields[7]
        nv = fields[8]
        if (decimal and decimal != nv) or (digit and digit != nv):
            raise SyntaxError("error: numeric values differ at\n  %s\n" % line)
        if nv:
            # Map improper fractions to proper ones.
            # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
            # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
            if nv == "2/12":
                nv = "1/6"
            elif nv == "3/12":
                nv = "1/4"
            elif nv == "4/12":
                nv = "1/3"
            elif nv == "6/12":
                nv = "1/2"
            elif nv == "8/12":
                nv = "2/3"
            elif nv == "9/12":
                nv = "3/4"
            elif nv == "10/12":
                nv = "5/6"
            props["nv"] = nv
            props["nt"] = "De" if decimal else "Di" if digit else "Nu"
        if fields[9] == "Y": props["Bidi_M"] = True
        # ICU 49 and above does not support Unicode_1_Name any more.
        # See ticket #9013.
        # na1 = fields[10]
        # if na1: props["na1"] = na1
        # ISO_Comment is deprecated and has no values.
        # isc = fields[11]
        # if isc: props["isc"] = isc
        # Simple case mappings.
        suc = fields[12]
        slc = fields[13]
        stc = fields[14]
        if suc: props["suc"] = suc
        if slc: props["slc"] = slc
        if stc: props["stc"] = stc
        SetProps(c, end, props)
    if range_first >= 0:
        raise SyntaxError(
            "error: unterminated range started at\n  %s\n" %
            range_first_line)
    # Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt.
    SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
    _alg_names_ranges.sort()


_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$")
_names_h2_re = re.compile("@\t\t(.+)")
_names_char_re = re.compile("([0-9a-fA-F]+)\t.+")

def ParseNamesList(in_file):
    pending_h2 = ""
    for line in in_file:
        line = line.strip()
        if not line: continue
        match = _names_h1_re.match(line)
        if match:
            pending_h2 = ""  # Drop a pending h2 when we get to an h1.
            start = int(match.group(1), 16)
            end = int(match.group(3), 16)
            comment = match.group(2).replace(u"\xa0", " ")
            _h1.append((start, end, comment))
            continue
        match = _names_h2_re.match(line)
        if match:
            pending_h2 = match.group(1).replace(u"\xa0", " ")
            continue
        if pending_h2:
            match = _names_char_re.match(line)
            if match:
                c = int(match.group(1), 16)
                _h2.append((c, pending_h2))
                pending_h2 = ""
    _h1.sort()
    _h2.sort()


def ParseNamedProperties(in_file):
    """Parses a .txt file where the first column is a code point range
    and the second column is a property name.
    Sets binary properties to True,
    and other properties to the values in the third column."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(data[2][0], data[2][1])
        else:
            # type == "range"
            if len(data[4]) == 2:
                SetBinaryPropertyToTrue(data[4][1], data[2], data[3])
            else:
                SetPropertyValue(data[4][1], data[4][2], data[2], data[3])


def ParseOneProperty(in_file, pname):
    """Parses a .txt file where the first column is a code point range
    and the second column is the value of a known property."""
    prop = GetProperty(pname)
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(pname, data[2][0])
        else:
            # type == "range"
            SetPropValue(prop, data[4][1], data[2], data[3])


def ParseBidiMirroring(in_file): ParseOneProperty(in_file, "bmg")
def ParseDerivedAge(in_file): ParseOneProperty(in_file, "age")
def ParseDerivedBidiClass(in_file): ParseOneProperty(in_file, "bc")
def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC")
def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
def ParseScriptExtensions(in_file): ParseOneProperty(in_file, "scx")
def ParseSentenceBreak(in_file): ParseOneProperty(in_file, "SB")
def ParseVerticalOrientation(in_file): ParseOneProperty(in_file, "vo")
def ParseWordBreak(in_file): ParseOneProperty(in_file, "WB")


def DoSetNameAlias(alias, start, end, c_props):
    if "Name_Alias" in c_props:
        c_props["Name_Alias"] += ',' + alias
    else:
        c_props["Name_Alias"] = alias


def ParseNameAliases(in_file):
    """Parses Name_Alias from NameAliases.txt.
    A character can have multiple aliases.

    In Unicode 6.0, there are two columns,
    with a name correction in the second column.

    In Unicode 6.1, there are three columns.
    The second contains an alias, the third its type.
    The documented types are:
      correction, control, alternate, figment, abbreviation

    This function does not sort the types, assuming they appear in this order."""
    for data in ReadUCDLines(in_file):
        start = data[2]
        end = data[3]
        if start != end:
            raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
                             (start, end))
        fields = data[4]
        if len(fields) == 2:
            alias = "correction=" + fields[1]
        else:
            alias = fields[2] + '=' + fields[1]
        update = (NeedToSetAlways, DoSetNameAlias, alias)
        UpdateProps(start, end, update)


def NeedToSetNumericValue(nv, start, end, c_props):
    c_nv = c_props.get("nv")
    if c_nv == None:
        # DerivedNumericValues.txt adds a Numeric_Value.
        assert "nt" not in c_props
        return True
    if nv != c_nv:
        raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
                          "but DerivedNumericValues.txt has nv=%s") %
                         (c_nv, start, end, nv))
    return False


def DoSetNumericValue(nv, start, end, c_props):
    c_props.update({"nt": "Nu", "nv": nv})


def ParseDerivedNumericValues(in_file):
    """Parses DerivedNumericValues.txt.
    For most characters, the numeric type & value were parsed previously
    from UnicodeData.txt but that does not show the values for Han characters.
    Here we check that values match those from UnicodeData.txt
    and add new ones."""
    # Ignore the @missing line which has an incorrect number of fields,
    # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
    # Also, "NaN" is just the Numeric null value anyway.
    for data in ReadUCDLines(in_file):
        # Conditional update to the numeric value in the 4th field.
        update = (NeedToSetNumericValue, DoSetNumericValue, data[4][3])
        UpdateProps(data[2], data[3], update)


def ParseCaseFolding(in_file):
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            assert data[2][0] == "C"  # common to scf & cf
            SetDefaultValue("scf", data[2][1])
            SetDefaultValue("cf", data[2][1])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            status = data[4][1]
            mapping = data[4][2]
            assert status in "CSFT"
            if status == "C":
                SetProps(start, end, {"scf": mapping, "cf": mapping})
            elif status == "S":
                SetPropertyValue("scf", mapping, start, end)
            elif status == "F":
                SetPropertyValue("cf", mapping, start, end)
            else:  # status == "T"
                SetPropertyValue("Turkic_Case_Folding", mapping, start, end)


def DoSetConditionalCaseMappings(ccm, start, end, c_props):
    if "Conditional_Case_Mappings" in c_props:
        c_props["Conditional_Case_Mappings"] += ',' + ccm
    else:
        c_props["Conditional_Case_Mappings"] = ccm


def ParseSpecialCasing(in_file):
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("lc", data[2][0])
            SetDefaultValue("tc", data[2][1])
            SetDefaultValue("uc", data[2][2])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            fields = data[4]
            if len(fields) < 5 or not fields[4]:
                # Unconditional mappings.
                SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
            else:
                # Conditional_Case_Mappings
                ccm = (fields[4] + ":lc=" + fields[1] +
                       "&tc=" + fields[2] + "&uc=" + fields[3])
                update = (NeedToSetAlways, DoSetConditionalCaseMappings, ccm)
                UpdateProps(start, end, update)


def ParseBidiBrackets(in_file):
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("bpt", data[2][1])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            assert start == end
            mapping = data[4][1]
            bracket_type = data[4][2]
            SetProps(start, end, {"bpb": mapping, "bpt": bracket_type})

# Postprocessing ----------------------------------------------------------- ***

def PrintedSize(pname, value):
    if isinstance(value, bool):
        if value:
            return len(pname) + 1  # ";pname"
        else:
            return len(pname) + 2  # ";-pname"
    else:
        return len(pname) + len(str(value)) + 2  # ";pname=value"
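# For example: PrintedSize("Alpha", True) == 6 (";Alpha"),
# PrintedSize("Alpha", False) == 7 (";-Alpha"),
# PrintedSize("gc", "Lu") == 6 (";gc=Lu").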


def CompactBlock(b, i):
    assert b[0] == _starts[i]
    b_props = b[2]  # Normally just blk from Blocks.txt.
    # b_props["blk"] has not been canonicalized yet.
    b_props["blk"] = _props[i]["blk"]
    orig_i = i
    # Count the number of occurrences of each property's value in this block.
    # To minimize the output, count the number of assigned ranges,
    # not the number of code points.
    num_ranges = 0
    prop_counters = {}
    if "gc" in b_props:
        b_is_unassigned = b_props["gc"] == "Cn"  # Unreachable with normal data.
    else:
        b_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
    while True:
        start = _starts[i]
        if start > b[1]: break
        props = _props[i]
        if "gc" in props:
            is_unassigned = props["gc"] == "Cn"
        else:
            is_unassigned = b_is_unassigned
        if is_unassigned:
            # Compact an unassigned range inside the block and
            # mark it to be written with "unassigned".
            # It falls back to default properties, not block properties,
            # except for the blk=Block property.
            assert props["blk"] == b_props["blk"]
            del props["blk"]
            for pname in props.keys():  # .keys() is a copy so we can del props[pname].
                if props[pname] == _null_or_defaults[pname]: del props[pname]
            # What remains are unusual default values for unassigned code points.
            # For example, bc=R or lb=ID.
            # See http://www.unicode.org/reports/tr44/#Default_Values_Table
            props["unassigned"] = True
        else:
            for (pname, value) in props.iteritems():
                if pname in prop_counters:
                    counter = prop_counters[pname]
                else:
                    counter = {_null_or_defaults[pname]: num_ranges}
                    prop_counters[pname] = counter
                if value in counter:
                    counter[value] += 1
                else:
                    counter[value] = 1
            # Also count default values for properties that do not occur in a range.
            for pname in prop_counters:
                if pname not in props:
                    counter = prop_counters[pname]
                    value = _null_or_defaults[pname]
                    counter[value] += 1
            num_ranges += 1
            # Invariant: For each counter, the sum of counts must equal num_ranges.
        i += 1
    # For each property that occurs within this block,
    # set the value that reduces the file size the most as a block property value.
    # This is usually the most common value.
    for (pname, counter) in prop_counters.iteritems():
        default_value = _null_or_defaults[pname]
        default_size = PrintedSize(pname, default_value) * counter[default_value]
        max_value = None
        max_count = 0
        max_savings = 0
        for (value, count) in counter.iteritems():
            if value != default_value and count > 1:
                # Does the file get smaller by setting the block default?
                # We save writing the block value as often as it occurs,
                # minus once for writing it for the block,
                # minus writing the default value instead.
                savings = PrintedSize(pname, value) * (count - 1) - default_size
                if savings > max_savings:
                    max_value = value
                    max_count = count
                    max_savings = savings
        # Do not compress uncompressible properties,
        # with an exception for many empty-string values in a block
        # (NFKC_CF='' for tags and variation selectors).
        if (max_savings > 0 and
            ((pname not in _uncompressible_props) or
             (max_value == '' and max_count >= 12))):
            b_props[pname] = max_value
    # For each range and property, remove the default+block value
    # but set the default value if that property was not set
    # (i.e., it used to inherit the default value).
    b_defaults = _null_or_defaults.copy()
    b_defaults.update(b_props)
    i = orig_i
    while True:
        start = _starts[i]
        if start > b[1]: break
        props = _props[i]
        if "unassigned" not in props:
            # Compact an assigned range inside the block.
            for pname in prop_counters:
                if pname in props:
                    if props[pname] == b_defaults[pname]: del props[pname]
                elif pname in b_props:
                    # b_props only has non-default values.
                    # Set the default value if it used to be inherited.
                    props[pname] = _null_or_defaults[pname]
            # If there is only one assigned range, then move all of its properties
            # to the block.
            if num_ranges == 1:
                b_props.update(props)
                props.clear()
        i += 1
    # Return the _starts index of the first range after this block.
    return i


def CompactNonBlock(limit, i):
    """Remove default property values from between-block ranges."""
    default_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
    while True:
        start = _starts[i]
        if start >= limit: break
        props = _props[i]
        if "gc" in props:
            is_unassigned = props["gc"] == "Cn"
        else:
            is_unassigned = default_is_unassigned
        for pname in props.keys():  # .keys() is a copy so we can del props[pname].
            if props[pname] == _null_or_defaults[pname]: del props[pname]
        assert "blk" not in props
        # If there are no props left, then nothing will be printed.
        # Otherwise, add "unassigned" for more obvious output.
        if props and is_unassigned:
            props["unassigned"] = True
        i += 1
    # Return the _starts index of the first range at or after the limit.
    return i


def CompactBlocks():
    """Optimizes block properties.
    Sets properties on blocks to the most commonly used values,
    and removes default+block values from code point properties."""
    # Ensure that there is a boundary in _starts for each block
    # so that the simple mixing method below works.
    for b in _blocks:
        AddBoundary(b[0])
        limit = b[1] + 1
        if limit <= 0x10ffff: AddBoundary(limit)
    # Walk through ranges and blocks together.
    i = 0
    for b in _blocks:
        b_start = b[0]
        if _starts[i] < b_start:
            i = CompactNonBlock(b_start, i)
        i = CompactBlock(b, i)
    CompactNonBlock(0x110000, i)

# Output ------------------------------------------------------------------- ***

def AppendRange(fields, start, end):
    if start == end:
        fields.append("%04lX" % start)
    else:
        fields.append("%04lX..%04lX" % (start, end))


def AppendProps(fields, props):
    # Sort property names (props keys) by their normalized forms
    # and output properties in that order.
    for pname in sorted(props, key=NormPropName):
        value = props[pname]
        if isinstance(value, bool):
            if not value: pname = "-" + pname
            fields.append(pname)
        else:
            fields.append("%s=%s" % (pname, value))


def WriteFieldsRangeProps(fields, start, end, props, out_file):
    AppendRange(fields, start, end)
    AppendProps(fields, props)
    out_file.write(";".join(fields))
    out_file.write("\n")
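# For example, WriteFieldsRangeProps(["cp"], 0x41, 0x5A,
# {"gc": "Lu", "Upper": True}, f) writes the line "cp;0041..005A;gc=Lu;Upper"
# (properties sorted by their normalized names).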


def EscapeNonASCII(s):
    i = 0
    while i < len(s):
        c = ord(s[i])
        if c <= 0x7f:
            i = i + 1
        else:
            if c <= 0xffff:
                esc = u"\\u%04X" % c
            else:
                esc = u"\\U%08X" % c
            s = s[:i] + esc + s[i+1:]
            i = i + len(esc)
    return s
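# For example, EscapeNonASCII(u"\u03B1") == u"\\u03B1"; code points above
# U+FFFF are written as \UXXXXXXXX (assuming a wide Unicode build where
# s[i] is a whole code point rather than a surrogate).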


def WritePreparsedUCD(out_file):
    out_file.write("""# Preparsed UCD generated by ICU preparseucd.py
# Copyright (C) 1991 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
""")
    out_file.write("ucd;%s\n\n" % _ucd_version)
    # Sort property names (props keys) by their normalized forms
    # and output properties in that order.
    pnames = sorted(_null_values, key=NormPropName)
    for pname in pnames:
        prop = _properties[pname]
        out_file.write(";".join(["property", prop[0]] + prop[1]))
        out_file.write("\n")
    out_file.write("\n")
    out_file.write(";".join(["binary"] + _binary_values["N"]))
    out_file.write("\n")
    out_file.write(";".join(["binary"] + _binary_values["Y"]))
    out_file.write("\n")
    for pname in pnames:
        prop = _properties[pname]
        short_names = prop[2]
        if short_names and prop[0] != "Binary":
            for name in sorted(short_names):
                out_file.write(";".join(["value", prop[1][0]] + prop[3][name]))
                out_file.write("\n")
    out_file.write("\n")
    # Ensure that there is a boundary in _starts for each
    # range of data we mix into the output,
    # so that the simple mixing method below works.
    for b in _blocks: AddBoundary(b[0])
    for r in _alg_names_ranges: AddBoundary(r[0])
    for h in _h1: AddBoundary(h[0])
    for h in _h2: AddBoundary(h[0])
    # Write the preparsed data. ppucd.txt = preparsed UCD
    # Syntax: http://site.icu-project.org/design/props/ppucd
    WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
    i_blocks = 0
    i_alg = 0
    i_h1 = 0
    i_h2 = 0
    b_end = -1
    for i in xrange(len(_starts) - 1):
        start = _starts[i]
        end = _starts[i + 1] - 1
        # Block with default properties.
        if i_blocks < len(_blocks) and start == _blocks[i_blocks][0]:
            b = _blocks[i_blocks]
            b_end = b[1]
            WriteFieldsRangeProps(["\nblock"], b[0], b_end, b[2], out_file)
            i_blocks += 1
        # NamesList h1 heading (for [most of] a block).
        if i_h1 < len(_h1) and start == _h1[i_h1][0]:
            h = _h1[i_h1]
            out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2])))
            i_h1 += 1
        # Algorithmic-names range.
        if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]:
            r = _alg_names_ranges[i_alg]
            fields = ["algnamesrange"]
            AppendRange(fields, r[0], r[1])
            fields.extend(r[2:])
            out_file.write(";".join(fields))
            out_file.write("\n")
            i_alg += 1
        # NamesList h2 heading.
        if i_h2 < len(_h2) and start == _h2[i_h2][0]:
            out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1]))
            i_h2 += 1
        # Code point/range data.
        props = _props[i]
        # Omit ranges with only default+block properties.
        if props:
            if start > b_end and b_end >= 0:
                # First range with values after the last block.
                # Separate it visually from the block lines.
                out_file.write("\n# No block\n")
                b_end = -1
            if "unassigned" in props:
                # Do not output "unassigned" as a property.
                del props["unassigned"]
                line_type = "unassigned"
            else:
                line_type = "cp"
            WriteFieldsRangeProps([line_type], start, end, props, out_file)

# Write Normalizer2 input files -------------------------------------------- ***
# Ported from gennorm/store.c.

def WriteAllCC(out_file):
    out_file.write("# Canonical_Combining_Class (ccc) values\n")
    prev_start = 0
    prev_cc = 0
    for i in xrange(len(_starts)):
        start = _starts[i]
        props = _props[i]
        cc = props.get("ccc")
        if not cc: cc = 0
        if prev_cc != cc:
            if prev_cc != 0:
                last_code_point = start - 1
                if prev_start == last_code_point:
                    out_file.write("%04X:%d\n" % (last_code_point, prev_cc))
                else:
                    out_file.write("%04X..%04X:%d\n" %
                                   (prev_start, last_code_point, prev_cc))
            prev_start = start
            prev_cc = cc


def HasMapping(c):
    props = GetProps(c)
    dt = props.get("dt")
    return dt and dt != "None"


def HasOneWayMapping(c):
    while True:
        props = GetProps(c)
        dt = props.get("dt")
        if not dt or dt == "None":
            return False  # no mapping
        elif dt == "Can":
            # The canonical decomposition is a one-way mapping if
            # - it does not map to exactly two code points
            # - c has ccc!=0
            # - c has the Composition_Exclusion property
            # - its starter has a one-way mapping (loop for this)
            # - its non-starter decomposes
            nfd = props["dm"].split()
            if (len(nfd) != 2 or
                props.get("ccc") or
                props.get("Comp_Ex") or
                HasMapping(int(nfd[1], 16))):
                return True
            c = int(nfd[0], 16)  # continue
        else:
            # c has a compatibility mapping.
            return True


_data_file_copyright = """# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others.  All Rights Reserved.
#
"""

def WriteNorm2NFCTextFile(path):
    global _data_file_copyright
    with open(os.path.join(path, "nfc.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfc.txt
#
# machine-generated by ICU preparseucd.py
#
# Complete data for Unicode NFC normalization.

* Unicode """ + _ucd_version + """

""")
        WriteAllCC(out_file)
        out_file.write("\n# Canonical decomposition mappings\n")
        for i in xrange(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            dm = props.get("dm")
            if dm and dm[0] != '<' and props["dt"] == "Can":
                assert start == end
                # The Comp_Ex=Full_Composition_Exclusion property tells us
                # whether the canonical decomposition round-trips.
                separator = '>' if props.get("Comp_Ex") else '='
                out_file.write("%04X%s%s\n" % (start, separator, dm))


def WriteNorm2NFKCTextFile(path):
    global _data_file_copyright
    with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfkc.txt
#
# machine-generated by ICU preparseucd.py
#
# Data for Unicode NFKC normalization.
# This file contains only compatibility decomposition mappings,
# plus those canonical decompositions that change from NFC round-trip mappings
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.

* Unicode """ + _ucd_version + """

""")
        for i in xrange(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            dm = props.get("dm")
            if dm and dm[0] != '<':
                assert start == end
                if props["dt"] != "Can":
                    # Compatibility decomposition.
                    out_file.write("%04X>%s\n" % (start, dm))
                elif not props.get("Comp_Ex") and HasOneWayMapping(start):
                    # NFC round-trip mapping turns into NFKC one-way mapping.
                    out_file.write("%04X>%s  # NFC round-trip, NFKC one-way\n" %
                                   (start, dm))


def WriteNorm2NFKC_CFTextFile(path):
    global _data_file_copyright
    with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfkc_cf.txt
#
# machine-generated by ICU preparseucd.py
#
# This file contains the Unicode NFKC_CF mappings,
# extracted from the UCD file DerivedNormalizationProps.txt,
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

""")
        out_file.write("* Unicode " + _ucd_version + "\n\n")
        prev_start = 0
        prev_end = 0
        prev_nfkc_cf = None
        for i in xrange(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            nfkc_cf = props.get("NFKC_CF")
            # Merge with the previous range if possible,
            # or remember this range for merging.
            if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
                prev_end = end
            else:
                if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'):
                    if prev_start == prev_end:
                        out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
                    else:
                        out_file.write("%04X..%04X>%s\n" %
                                       (prev_start, prev_end, prev_nfkc_cf))
                prev_start = start
                prev_end = end
                prev_nfkc_cf = nfkc_cf


def WriteNorm2(path):
    WriteNorm2NFCTextFile(path)
    WriteNorm2NFKCTextFile(path)
    WriteNorm2NFKC_CFTextFile(path)

# UTS #46 Normalizer2 input file ------------------------------------------- ***

_idna_replacements = [
    # Several versions of avoiding circular FFFD>FFFD mappings,
    # depending on the version of the input file.
    (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
    (re.compile(r"\.\.FFFD"), "..FFFC"),
    (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),
    # Since we switch between checking and not checking for STD3 character
    # restrictions at runtime, checking the non-LDH ASCII characters in code,
    # we treat these values here like their regular siblings.
    (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
    (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
    # For UTS #46, we do not care about "not valid in IDNA2008".
    (re.compile(r"; *; NV8 +"), ""),
    # ICU 63+ normalization no longer allows mappings for surrogate code points,
    # and the UTS #46 code handles them instead.
    (re.compile(r"^D800..DFFF    ; disallowed"), r"# D800..DFFF disallowed in code"),
    # Normal transformations.
    (re.compile(r"; disallowed"), ">FFFD"),
    (re.compile(r"; ignored"), ">"),
    (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
    (re.compile(r"; mapped +; "), ">"),
    (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
]
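# Illustrative effect on IdnaMappingTable.txt lines (spacing abbreviated):
#   "0041  ; mapped  ; 0061"  becomes  "0041  >0061"
#   "00AD  ; ignored"         becomes  "00AD  >"
#   "0221  ; disallowed"      becomes  "0221  >FFFD"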
|  |  | 
|  | def IdnaToUTS46TextFile(s, t): | 
|  | """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format.""" | 
|  | # Different input/output file names. | 
|  | dest_path = os.path.dirname(t) | 
|  | t = os.path.join(dest_path, "uts46.txt") | 
|  | # TODO: With Python 2.7+, combine the two with statements into one. | 
|  | with open(s, "r") as in_file: | 
|  | with open(t, "w") as out_file: | 
|  | out_file.write("# Original file:\n") | 
|  | for line in in_file: | 
|  | orig_line = line | 
|  | if line.startswith("# For documentation, see"): | 
|  | out_file.write(line) | 
|  | out_file.write(r""" | 
|  | # ================================================ | 
|  | # This file has been reformatted into syntax for the | 
|  | # gennorm2 Normalizer2 data generator tool. | 
|  | # | 
|  | # "valid", "disallowed_STD3_valid" and "deviation" lines are commented out. | 
|  | # "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax. | 
|  | # "disallowed" lines map to U+FFFD. | 
|  | # "ignored" lines map to an empty string. | 
|  | # | 
|  | # Characters disallowed under STD3 rules are treated as valid or mapped; | 
|  | # they are handled in code. | 
|  | # Deviation characters are also handled in code. | 
|  | # | 
|  | # Use this file as the second gennorm2 input file after nfc.txt. | 
|  | # ================================================ | 
|  | """) | 
|  | continue | 
|  | if line[0] in "#\r\n": | 
|  | out_file.write(line) | 
|  | continue | 
|  | for rep in _idna_replacements: line = rep[0].sub(rep[1], line) | 
|  | # Align inline comments at column 40. | 
comment_pos = line.find("#", 1)
if 0 <= comment_pos < 40:  # Guard against lines without a trailing comment.
|  | line = (line[:comment_pos] + ((40 - comment_pos) * ' ') + | 
|  | line[comment_pos:]) | 
|  | elif comment_pos > 40: | 
|  | space_pos = comment_pos | 
|  | while space_pos > 0 and line[space_pos - 1] == ' ': | 
|  | space_pos = space_pos - 1 | 
|  | if space_pos < 40: | 
|  | # Fewer than 40 characters before the comment: | 
|  | # Align comments at column 40. | 
|  | line = line[:40] + line[comment_pos:] | 
|  | else: | 
|  | # 40 or more characters before the comment: | 
|  | # Keep one space between contents and comment. | 
|  | line = line[:space_pos] + " " + line[comment_pos:] | 
|  | # Write the modified line. | 
|  | out_file.write(line) | 
|  | if "..FFFF" in orig_line and "..FFFC" in line: | 
|  | out_file.write("FFFE..FFFF    >FFFD\n"); | 
|  | return t | 
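# Alignment sketch (illustrative): in a transformed line like
#   "00AA          >0061 # 1.1 FEMININE ORDINAL INDICATOR"
# the "#" is pushed out to column 40; when the content before the
# comment already reaches column 40, one space is kept before the "#".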
|  |  | 
|  | # Preprocessing ------------------------------------------------------------ *** | 
|  |  | 
_strip_re = re.compile(r"([0-9a-fA-F]+.+?) *#.*")
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")
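# Examples (illustrative data): _strip_re.match() reduces
#   "0640          ; Extender # Lm       ARABIC TATWEEL"
# to its group(1) "0640          ; Extender", and _code_point_re
# extracts the leading code point 0640 from such a stripped line.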
|  |  | 
|  | def CopyAndStripWithOptionalMerge(s, t, do_merge): | 
|  | # TODO: We do not seem to need the do_merge argument and logic any more. | 
|  | with open(s, "r") as in_file, open(t, "w") as out_file: | 
|  | first = -1  # First code point with first_data. | 
|  | last = -1  # Last code point with first_data. | 
|  | first_data = ""  # Common data for code points [first..last]. | 
|  | for line in in_file: | 
|  | match = _strip_re.match(line) | 
|  | if match: | 
|  | line = match.group(1) | 
|  | else: | 
|  | line = line.rstrip() | 
|  | if do_merge: | 
|  | match = _code_point_re.match(line) | 
|  | if match: | 
|  | c = int(match.group(1), 16) | 
|  | data = line[match.end() - 1:] | 
|  | else: | 
|  | c = -1 | 
|  | data = "" | 
|  | if last >= 0 and (c != (last + 1) or data != first_data): | 
|  | # output the current range | 
|  | if first == last: | 
|  | out_file.write("%04X%s\n" % (first, first_data)) | 
|  | else: | 
|  | out_file.write("%04X..%04X%s\n" % (first, last, first_data)) | 
|  | first = -1 | 
|  | last = -1 | 
|  | first_data = "" | 
|  | if c < 0: | 
|  | # no data on this line, output as is | 
|  | out_file.write(line) | 
|  | out_file.write("\n") | 
|  | else: | 
|  | # data on this line, store for possible range compaction | 
|  | if last < 0: | 
|  | # set as the first line in a possible range | 
|  | first = c | 
|  | last = c | 
|  | first_data = data | 
|  | else: | 
|  | # must be c == (last + 1) and data == first_data | 
|  | # because of previous conditions | 
|  | # continue with the current range | 
|  | last = c | 
|  | else: | 
|  | # Only strip, don't merge: just output the stripped line. | 
|  | out_file.write(line) | 
|  | out_file.write("\n") | 
|  | if do_merge and last >= 0: | 
|  | # output the last range in the file | 
|  | if first == last: | 
|  | out_file.write("%04X%s\n" % (first, first_data)) | 
|  | else: | 
|  | out_file.write("%04X..%04X%s\n" % (first, last, first_data)) | 
|  | first = -1 | 
|  | last = -1 | 
|  | first_data = "" | 
|  | out_file.flush() | 
|  | return t | 
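# Merge sketch (hypothetical input): with do_merge=True, the lines
#   0600;Cf
#   0601;Cf
#   0602;Cf
# collapse into the single range line "0600..0602;Cf"; a line whose
# code point or data does not continue the current run flushes the
# pending range first.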
|  |  | 
|  |  | 
|  | def CopyAndStrip(s, t): | 
|  | """Copies a file and removes comments behind data lines but not in others.""" | 
|  | return CopyAndStripWithOptionalMerge(s, t, False) | 
|  |  | 
|  |  | 
|  | def CopyAndStripAndMerge(s, t): | 
|  | """Copies and strips a file and merges lines. | 
|  |  | 
|  | Copies a file, removes comments, and | 
|  | merges lines with adjacent code point ranges and identical per-code point | 
|  | data lines into one line with range syntax. | 
|  | """ | 
|  | return CopyAndStripWithOptionalMerge(s, t, True) | 
|  |  | 
|  |  | 
|  | def CopyOnly(s, t): | 
|  | shutil.copy(s, t) | 
|  | return t | 
|  |  | 
|  |  | 
|  | def DontCopy(s, t): | 
|  | return s | 
|  |  | 
|  |  | 
|  | # Each _files value is a | 
|  | # (preprocessor, dest_folder, parser, order) tuple | 
|  | # where all fields except the preprocessor are optional. | 
|  | # After the initial preprocessing (copy/strip/merge), | 
|  | # if a parser is specified, then a tuple is added to _files_to_parse | 
|  | # at index "order" (default order 9). | 
|  | # An explicit order number is set only for files that must be parsed | 
|  | # before others. | 
|  | _files = { | 
|  | "BidiBrackets.txt": (DontCopy, ParseBidiBrackets), | 
|  | "BidiMirroring.txt": (DontCopy, ParseBidiMirroring), | 
|  | "BidiTest.txt": (CopyOnly, "testdata"), | 
|  | "Blocks.txt": (DontCopy, ParseBlocks), | 
|  | "CaseFolding.txt": (CopyOnly, ParseCaseFolding), | 
|  | "DerivedAge.txt": (DontCopy, ParseDerivedAge), | 
|  | "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass), | 
|  | "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties), | 
|  | "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup), | 
|  | "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType), | 
|  | "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties), | 
|  | "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues), | 
|  | "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth), | 
|  | "emoji-data.txt": (DontCopy, ParseNamedProperties), | 
|  | "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty), | 
|  | "GraphemeBreakTest.txt": (CopyOnly, "testdata"), | 
|  | "IdnaTestV2.txt": (CopyOnly, "testdata"), | 
|  | "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory), | 
|  | "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory), | 
|  | "LineBreak.txt": (DontCopy, ParseLineBreak), | 
|  | "LineBreakTest.txt": (CopyOnly, "testdata"), | 
|  | "NameAliases.txt": (DontCopy, ParseNameAliases), | 
|  | "NamesList.txt": (DontCopy, ParseNamesList), | 
|  | "NormalizationCorrections.txt": (CopyOnly,),  # Only used in gensprep. | 
|  | "NormalizationTest.txt": (CopyAndStrip,), | 
|  | "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0), | 
|  | "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1), | 
|  | "PropList.txt": (DontCopy, ParseNamedProperties), | 
|  | "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak), | 
|  | "SentenceBreakTest.txt": (CopyOnly, "testdata"), | 
|  | "Scripts.txt": (DontCopy, ParseScripts), | 
|  | "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions), | 
|  | "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing), | 
|  | "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2), | 
|  | "VerticalOrientation.txt": (DontCopy, ParseVerticalOrientation), | 
|  | "WordBreakProperty.txt": (DontCopy, ParseWordBreak), | 
|  | "WordBreakTest.txt": (CopyOnly, "testdata"), | 
|  | # From www.unicode.org/Public/idna/<version>/ | 
|  | "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2") | 
|  | } | 
|  |  | 
|  | # List of lists of files to be parsed in order. | 
|  | # Inner lists contain (basename, path, parser) tuples. | 
|  | _files_to_parse = [[], [], [], [], [], [], [], [], [], []] | 
|  |  | 
|  | # Get the standard basename from a versioned filename. | 
|  | # For example, match "UnicodeData-6.1.0d8.txt" | 
|  | # so we can turn it into "UnicodeData.txt". | 
|  | _file_version_re = re.compile("([a-zA-Z0-9]+)" + | 
|  | "-[0-9]+(?:\\.[0-9]+)*(?:d[0-9]+)?" + | 
|  | "(\\.[a-z]+)$") | 
|  |  | 
|  | def PreprocessFiles(source_files, icu4c_src_root): | 
|  | unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata") | 
|  | norm2_path = os.path.join(unidata_path, "norm2") | 
|  | testdata_path = os.path.join(icu4c_src_root, "source", "test", "testdata") | 
|  | folder_to_path = { | 
|  | "unidata": unidata_path, | 
|  | "norm2": norm2_path, | 
|  | "testdata": testdata_path | 
|  | } | 
|  | files_processed = set() | 
|  | for source_file in source_files: | 
|  | (folder, basename) = os.path.split(source_file) | 
|  | match = _file_version_re.match(basename) | 
|  | if match: | 
|  | new_basename = match.group(1) + match.group(2) | 
|  | if new_basename != basename: | 
|  | print "Removing version suffix from " + source_file | 
|  | # ... so that we can easily compare UCD files. | 
|  | new_source_file = os.path.join(folder, new_basename) | 
|  | shutil.move(source_file, new_source_file) | 
|  | basename = new_basename | 
|  | source_file = new_source_file | 
|  | if basename in _files: | 
|  | print "Preprocessing %s" % basename | 
|  | if basename in files_processed: | 
|  | raise Exception("duplicate file basename %s!" % basename) | 
|  | files_processed.add(basename) | 
|  | value = _files[basename] | 
|  | preprocessor = value[0] | 
|  | if len(value) >= 2 and isinstance(value[1], (str, unicode)): | 
# The value was (preprocessor, dest_folder, ...); keep the remainder.
|  | dest_folder = value[1] | 
|  | value = value[2:] | 
|  | else: | 
# The value was (preprocessor, ...); keep the remainder.
|  | dest_folder = "unidata" | 
|  | value = value[1:] | 
|  | dest_path = folder_to_path[dest_folder] | 
|  | if not os.path.exists(dest_path): os.makedirs(dest_path) | 
|  | dest_file = os.path.join(dest_path, basename) | 
|  | parse_file = preprocessor(source_file, dest_file) | 
|  | if value: | 
|  | order = 9 if len(value) < 2 else value[1] | 
|  | _files_to_parse[order].append((basename, parse_file, value[0])) | 
|  |  | 
|  | # Character names ---------------------------------------------------------- *** | 
|  |  | 
|  | # TODO: Turn this script into a module that | 
|  | # a) gives access to the parsed data | 
|  | # b) has a PreparseUCD(ucd_root, icu4c_src_root) function | 
|  | # c) has a ParsePreparsedUCD(filename) function | 
|  | # d) has a WritePreparsedUCD(filename) function | 
|  | # and then use it from a new script for names. | 
|  | # Some more API: | 
|  | # - generator GetRangesAndProps() -> (start, end, props)* | 
|  |  | 
|  | def IncCounter(counters, key, inc=1): | 
|  | if key in counters: | 
|  | counters[key] += inc | 
|  | else: | 
|  | counters[key] = inc | 
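# Equivalent one-liner, kept above in the explicit form:
#   counters[key] = counters.get(key, 0) + inc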
|  |  | 
|  |  | 
|  | endings = ( | 
|  | # List PHASE- before LETTER for BAMUM LETTER PHASE-xyz. | 
|  | "PHASE-", | 
|  | "LETTER ", "LIGATURE ", "CHARACTER ", "SYLLABLE ", | 
|  | "CHOSEONG ", "JUNGSEONG ", "JONGSEONG ", | 
|  | "SYLLABICS ", "IDEOGRAPH ", "IDEOGRAPH-", "IDEOGRAM ", "MONOGRAM ", | 
|  | "ACROPHONIC ", "HIEROGLYPH ", | 
|  | "DIGIT ", "NUMBER ", "NUMERAL ", "FRACTION ", | 
|  | "PUNCTUATION ", "SIGN ", "SYMBOL ", | 
|  | "TILE ", "CARD ", "FACE ", | 
|  | "ACCENT ", "POINT ", | 
|  | # List SIGN before VOWEL to catch "vowel sign". | 
|  | "VOWEL ", "TONE ", "RADICAL ", | 
|  | # For names of math symbols, | 
|  | # e.g., MATHEMATICAL BOLD ITALIC CAPITAL A | 
|  | "SCRIPT ", "FRAKTUR ", "MONOSPACE ", | 
|  | "ITALIC ", "BOLD ", "DOUBLE-STRUCK ", "SANS-SERIF ", | 
|  | "INITIAL ", "TAILED ", "STRETCHED ", "LOOPED ", | 
|  | # BRAILLE PATTERN DOTS-xyz | 
|  | "DOTS-", | 
|  | "SELECTOR ", "SELECTOR-" | 
|  | ) | 
|  |  | 
|  | def SplitName(name, tokens): | 
|  | start = 0 | 
|  | for e in endings: | 
|  | i = name.find(e) | 
|  | if i >= 0: | 
|  | start = i + len(e) | 
|  | token = name[:start] | 
|  | IncCounter(tokens, token) | 
|  | break | 
|  | for i in xrange(start, len(name)): | 
|  | c = name[i] | 
|  | if c == ' ' or c == '-': | 
|  | token = name[start:i + 1] | 
|  | IncCounter(tokens, token) | 
|  | start = i + 1 | 
|  | IncCounter(tokens, name[start:]) | 
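# Examples (real character names, illustrating the two phases):
#   SplitName("LATIN SMALL LETTER A", tokens) counts
#     "LATIN SMALL LETTER ", "A"
#   SplitName("BAMUM LETTER PHASE-A NGKUE MFON", tokens) matches "PHASE-"
#   (listed before "LETTER ") and counts
#     "BAMUM LETTER PHASE-", "A ", "NGKUE ", "MFON"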
|  |  | 
|  |  | 
|  | def PrintNameStats(): | 
|  | # TODO: This name analysis code is out of date. | 
|  | # It needs to consider the multi-type Name_Alias values. | 
|  | name_pnames = ("na", "na1", "Name_Alias") | 
|  | counts = {} | 
|  | for pname in name_pnames: | 
|  | counts[pname] = 0 | 
|  | total_lengths = counts.copy() | 
|  | max_length = 0 | 
|  | max_per_cp = 0 | 
|  | name_chars = set() | 
|  | num_digits = 0 | 
|  | token_counters = {} | 
|  | char_counters = {} | 
|  | for i in xrange(len(_starts) - 1): | 
|  | start = _starts[i] | 
|  | # end = _starts[i + 1] - 1 | 
|  | props = _props[i] | 
|  | per_cp = 0 | 
|  | for pname in name_pnames: | 
|  | if pname in props: | 
|  | counts[pname] += 1 | 
|  | name = props[pname] | 
|  | total_lengths[pname] += len(name) | 
|  | name_chars |= set(name) | 
|  | if len(name) > max_length: max_length = len(name) | 
|  | per_cp += len(name) + 1 | 
|  | if per_cp > max_per_cp: max_per_cp = per_cp | 
SplitName(name, token_counters)  # Counts tokens in place; no return value.
|  | for c in name: | 
|  | if c in "0123456789": num_digits += 1 | 
|  | IncCounter(char_counters, c) | 
|  | print | 
|  | for pname in name_pnames: | 
|  | print ("'%s' character names: %d / %d bytes" % | 
|  | (pname, counts[pname], total_lengths[pname])) | 
|  | print "%d total bytes in character names" % sum(total_lengths.itervalues()) | 
|  | print ("%d name-characters: %s" % | 
|  | (len(name_chars), "".join(sorted(name_chars)))) | 
|  | print "%d digits 0-9" % num_digits | 
|  | count_chars = [(count, c) for (c, count) in char_counters.iteritems()] | 
|  | count_chars.sort(reverse=True) | 
|  | for cc in count_chars: | 
|  | print "name-chars: %6d * '%s'" % cc | 
|  | print "max. name length: %d" % max_length | 
|  | print "max. length of all (names+NUL) per cp: %d" % max_per_cp | 
|  |  | 
|  | token_lengths = sum([len(t) + 1 for t in token_counters]) | 
|  | print ("%d total tokens, %d bytes with NUL" % | 
|  | (len(token_counters), token_lengths)) | 
|  |  | 
|  | counts_tokens = [] | 
|  | for (token, count) in token_counters.iteritems(): | 
# If we encode a token with a 1-byte code, then we save len(token)-1 bytes
# each time, but have to store the token string itself with a length or
# terminator byte, plus a 2-byte entry in a token index table.
|  | savings = count * (len(token) - 1) - (len(token) + 1 + 2) | 
|  | if savings > 0: | 
|  | counts_tokens.append((savings, count, token)) | 
|  | counts_tokens.sort(reverse=True) | 
|  | print "%d tokens might save space with 1-byte codes" % len(counts_tokens) | 
|  |  | 
# Codes are bytes; about 40 of the 256 byte values are used for the
# name_chars themselves.
# That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens.
# Make each 2-byte token code the token string's offset itself, rather than
# an index into a separate string-index table.
# More lead bytes, but also more savings.
|  | num_units = 256 | 
max_lead = (token_lengths + 255) // 256  # Ceiling division.
|  | max_token_units = num_units - len(name_chars) | 
|  | results = [] | 
|  | for num_lead in xrange(min(max_lead, max_token_units) + 1): | 
|  | max1 = max_token_units - num_lead | 
|  | ct = counts_tokens[:max1] | 
|  | tokens1 = set([t for (s, c, t) in ct]) | 
|  | for (token, count) in token_counters.iteritems(): | 
|  | if token in tokens1: continue | 
# If we encode a token with a 2-byte code, then we save len(token)-2 bytes
# each time, but have to store the token string itself with a length or
# terminator byte.
|  | savings = count * (len(token) - 2) - (len(token) + 1) | 
|  | if savings > 0: | 
|  | ct.append((savings, count, token)) | 
|  | ct.sort(reverse=True) | 
# A 2-byte-code token's string offset must be less than limit_t_lengths.
|  | limit_t_lengths = num_lead * 256 | 
|  | token2_index = 0 | 
|  | for i in xrange(max1, len(ct)): | 
|  | if token2_index >= limit_t_lengths: | 
|  | del ct[i:] | 
|  | break | 
|  | token2_index += len(ct[i][2]) + 1 | 
|  | cumul_savings = sum([s for (s, c, t) in ct]) | 
|  | # print ("%2d 1-byte codes: %4d tokens might save %6d bytes" % | 
|  | #        (max1, len(ct), cumul_savings)) | 
|  | results.append((cumul_savings, max1, ct)) | 
|  | best = max(results)  # (cumul_savings, max1, ct) | 
|  |  | 
|  | max1 = best[1] | 
|  | print ("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" % | 
|  | (best[0], max1, max_token_units - max1)) | 
|  | counts_tokens = best[2] | 
|  | cumul_savings = 0 | 
|  | for i in xrange(len(counts_tokens)): | 
|  | n = 1 if i < max1 else 2 | 
|  | i1 = i + 1 | 
|  | t = counts_tokens[i] | 
|  | cumul_savings += t[0] | 
|  | if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens): | 
|  | print (("%04d. cumul. %6d bytes save %6d bytes from " + | 
|  | "%5d * %d-byte token for %2d='%s'") % | 
|  | (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2])) | 
|  |  | 
|  | # ICU API ------------------------------------------------------------------ *** | 
|  |  | 
|  | # Sample line to match: | 
|  | #    UCHAR_UNIFIED_IDEOGRAPH=29, | 
|  | _uchar_re = re.compile( | 
|  | " *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),") | 
|  |  | 
|  | # Sample line to match: | 
|  | #    /** Zs @stable ICU 2.0 */ | 
_gc_comment_re = re.compile(r" */\*\* *([A-Z][a-z]) ")
|  |  | 
|  | # Sample line to match: | 
|  | #    U_SPACE_SEPARATOR         = 12, | 
|  | _gc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,") | 
|  |  | 
|  | # Sample line to match: | 
|  | #    /** L @stable ICU 2.0 */ | 
_bc_comment_re = re.compile(r" */\*\* *([A-Z]{1,3}) ")
|  |  | 
|  | # Sample line to match: | 
|  | #    U_LEFT_TO_RIGHT               = 0, | 
|  | _bc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,") | 
|  |  | 
|  | # Sample line to match: | 
|  | #    UBLOCK_CYRILLIC =9, | 
|  | _ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,") | 
|  |  | 
|  | # Sample line to match: | 
|  | #    U_EA_AMBIGUOUS, | 
|  | _prop_and_value_re = re.compile( | 
|  | " *(U_(BPT|DT|EA|GCB|HST|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))") | 
|  |  | 
|  | # Sample line to match if it has matched _prop_and_value_re | 
|  | # (we want to exclude aliases): | 
|  | #    U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL, | 
|  | _prop_and_alias_re = re.compile(" *U_[0-9A-Z_]+ *= *U") | 
|  |  | 
|  | def ParseUCharHeader(icu4c_src_root): | 
|  | uchar_path = os.path.join(icu4c_src_root, "source", | 
|  | "common", "unicode", "uchar.h") | 
|  | with open(uchar_path, "r") as uchar_file: | 
|  | mode = ""  # Mode string (=pname) during context-sensitive parsing. | 
|  | comment_value = ""  # Property value from a comment preceding an enum. | 
|  | # Note: The enum UProperty is first in uchar.h, before the enums for values. | 
|  | for line in uchar_file: | 
|  | # Parse some enums via context-sensitive "modes". | 
|  | # Necessary because the enum constant names do not contain | 
|  | # enough information. | 
|  | if "enum UCharCategory" in line: | 
|  | mode = "gc" | 
|  | comment_value = "" | 
|  | continue | 
|  | if mode == "gc": | 
|  | if line.startswith("}"): | 
|  | mode = "" | 
|  | continue | 
|  | match = _gc_comment_re.match(line) | 
|  | if match: | 
|  | comment_value = match.group(1) | 
|  | continue | 
|  | match = _gc_re.match(line) | 
|  | if match and comment_value: | 
|  | gc_enum = match.group(1) | 
|  | prop = _properties["gc"] | 
|  | vname = GetShortPropertyValueName(prop, comment_value) | 
|  | icu_values = _pname_to_icu_prop["gc"][2] | 
|  | icu_values.append((gc_enum, vname)) | 
|  | comment_value = "" | 
|  | continue | 
|  | if "enum UCharDirection {" in line: | 
|  | mode = "bc" | 
|  | comment_value = "" | 
|  | continue | 
|  | if mode == "bc": | 
|  | if line.startswith("}"): | 
|  | mode = "" | 
|  | continue | 
|  | match = _bc_comment_re.match(line) | 
|  | if match: | 
|  | comment_value = match.group(1) | 
|  | continue | 
|  | match = _bc_re.match(line) | 
|  | if match and comment_value: | 
|  | bc_enum = match.group(1) | 
|  | prop = _properties["bc"] | 
|  | vname = GetShortPropertyValueName(prop, comment_value) | 
|  | icu_values = _pname_to_icu_prop["bc"][2] | 
|  | icu_values.append((bc_enum, vname)) | 
|  | comment_value = "" | 
|  | continue | 
|  | # No mode, parse enum constants whose names contain | 
|  | # enough information to parse without requiring context. | 
|  | match = _uchar_re.match(line) | 
|  | if match: | 
|  | prop_enum = match.group(1) | 
|  | if prop_enum.endswith("_LIMIT"): | 
|  | # Ignore "UCHAR_BINARY_LIMIT=57," etc. | 
|  | continue | 
|  | pname = GetShortPropertyName(prop_enum[6:]) | 
|  | icu_prop = (prop_enum, pname, []) | 
|  | _icu_properties.append(icu_prop) | 
|  | _pname_to_icu_prop[pname] = icu_prop | 
|  | continue | 
|  | match = _ublock_re.match(line) | 
|  | if match: | 
|  | prop_enum = match.group(1) | 
|  | if prop_enum == "UBLOCK_COUNT": | 
|  | continue | 
|  | prop = _properties["blk"] | 
|  | vname = GetShortPropertyValueName(prop, prop_enum[7:]) | 
|  | icu_values = _pname_to_icu_prop["blk"][2] | 
|  | icu_values.append((prop_enum, vname)) | 
|  | continue | 
|  | match = _prop_and_value_re.match(line) | 
|  | if match: | 
|  | (prop_enum, vname) = match.group(1, 3) | 
|  | if vname == "COUNT" or _prop_and_alias_re.match(line): | 
|  | continue | 
|  | pname = GetShortPropertyName(match.group(2)) | 
|  | prop = _properties[pname] | 
|  | vname = GetShortPropertyValueName(prop, vname) | 
|  | icu_values = _pname_to_icu_prop[pname][2] | 
|  | icu_values.append((prop_enum, vname)) | 
|  | # ccc, lccc, tccc use their numeric values as "enum" values. | 
|  | # In the UCD data, these numeric values are the first value names, | 
|  | # followed by the short & long value names. | 
|  | # List the ccc values in numeric order. | 
|  | prop = _properties["ccc"] | 
|  | icu_values = _pname_to_icu_prop["ccc"][2] | 
|  | for ccc in sorted([int(name) for name in prop[2]]): | 
|  | icu_values.append((ccc, str(ccc))) | 
|  | _pname_to_icu_prop["lccc"][2].extend(icu_values)  # Copy ccc -> lccc. | 
|  | _pname_to_icu_prop["tccc"][2].extend(icu_values)  # Copy ccc -> tccc. | 
|  |  | 
|  | # No need to parse predictable General_Category_Mask enum constants. | 
|  | # Just define them in ASCII order. | 
|  | prop = _properties["gcm"] | 
|  | icu_values = _pname_to_icu_prop["gcm"][2] | 
|  | for vname in sorted(prop[2]): | 
|  | icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname)) | 
|  | # Hardcode known values for the normalization quick check properties, | 
|  | # see unorm2.h for the UNormalizationCheckResult enum. | 
|  | icu_values = _pname_to_icu_prop["NFC_QC"][2] | 
|  | icu_values.append(("UNORM_NO", "N")) | 
|  | icu_values.append(("UNORM_YES", "Y")) | 
|  | icu_values.append(("UNORM_MAYBE", "M")) | 
|  | _pname_to_icu_prop["NFKC_QC"][2].extend(icu_values)  # Copy NFC -> NFKC. | 
|  | # No "maybe" values for NF[K]D. | 
|  | icu_values = _pname_to_icu_prop["NFD_QC"][2] | 
|  | icu_values.append(("UNORM_NO", "N")) | 
|  | icu_values.append(("UNORM_YES", "Y")) | 
|  | _pname_to_icu_prop["NFKD_QC"][2].extend(icu_values)  # Copy NFD -> NFKD. | 
|  |  | 
|  |  | 
|  | # Sample line to match: | 
|  | #    USCRIPT_LOMA   = 139,/* Loma */ | 
_uscript_re = re.compile(
r" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")
|  |  | 
|  | def ParseUScriptHeader(icu4c_src_root): | 
|  | uscript_path = os.path.join(icu4c_src_root, "source", | 
|  | "common", "unicode", "uscript.h") | 
|  | icu_values = _pname_to_icu_prop["sc"][2] | 
|  | with open(uscript_path, "r") as uscript_file: | 
|  | for line in uscript_file: | 
|  | match = _uscript_re.match(line) | 
|  | if match: | 
|  | (script_enum, script_code) = match.group(1, 2) | 
|  | icu_values.append((script_enum, script_code)) | 
|  |  | 
|  |  | 
|  | def CheckPNamesData(): | 
|  | """Checks that every ICU property has a full set of value enum constants, | 
|  | and that the _icu_properties value names map back to the UCD.""" | 
|  | missing_enums = [] | 
|  | for (p_enum, pname, values) in _icu_properties: | 
|  | prop = _properties[pname] | 
|  | vnames = set(prop[2])  # Modifiable copy of the set of short value names. | 
|  | for (v_enum, vname) in values: | 
|  | if vname not in vnames: | 
|  | raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" % | 
|  | (pname, vname, v_enum)) | 
|  | vnames.remove(vname) | 
|  | # Exceptions to the all-values check: | 
|  | # - ICU does not have specific enum values for binary No/Yes. | 
|  | # - ICU represents Age values via UVersionInfo rather than enum constants. | 
|  | # - gc: ICU enum UCharCategory only has the single-category values. | 
|  | #       (ICU's gcm property has all of the UCD gc property values.) | 
|  | if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")): | 
|  | missing_enums.append((pname, vnames)) | 
|  | if missing_enums: | 
|  | raise ValueError( | 
|  | "missing uchar.h enum constants for some property values: %s" % | 
|  | missing_enums) | 
|  |  | 
|  |  | 
|  | def WritePNamesDataHeader(out_path): | 
|  | with open(out_path, "w") as out_file: | 
|  | out_file.write("""// © 2016 and later: Unicode, Inc. and others. | 
|  | // License & terms of use: http://www.unicode.org/copyright.html | 
|  | /** | 
|  | * Copyright (C) 2002-2016, International Business Machines Corporation and | 
|  | * others. All Rights Reserved. | 
|  | * | 
|  | * machine-generated by: icu/tools/unicode/py/preparseucd.py | 
|  | */ | 
|  |  | 
|  | """) | 
|  |  | 
|  | # Note: The uchar.h & uscript.h parsers store the ICU Unicode properties | 
|  | # and values in the order of their definition, | 
|  | # and this function writes them in that order. | 
|  | # Since the ICU API constants are stable and new values are only | 
|  | # appended at the end | 
|  | # (new properties are added at the end of each binary/enum/... range), | 
|  | # the output is stable as well. | 
|  | # When a property or value constant is renamed, | 
|  | # it only changes the name itself in the output; | 
|  | # it does not move in the output since there is no sorting. | 
|  | # This minimizes diffs and assists with reviewing and evaluating updates. | 
|  |  | 
|  | version = _ucd_version.split('.') | 
|  | while len(version) < 4: version.append("0") | 
|  | out_file.write("#define UNICODE_VERSION { %s }\n\n" % ", ".join(version)) | 
|  |  | 
|  | # Count the maximum number of aliases for any property or value. | 
|  | # We write the final value at the end. | 
|  | max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"])) | 
|  |  | 
|  | # Write an array of "binprop" Value object initializers | 
|  | # with the value aliases shared among all binary properties. | 
|  | out_file.write("static const Value VALUES_binprop[2] = {\n") | 
|  | out_file.write('    Value(0, "%s"),\n' % " ".join(_binary_values["N"])) | 
|  | out_file.write('    Value(1, "%s"),\n' % " ".join(_binary_values["Y"])) | 
|  | out_file.write("};\n\n") | 
|  |  | 
|  | # For each property with named values, write an array of | 
|  | # Value object initializers with the value enum and the aliases. | 
|  | for (p_enum, pname, values) in _icu_properties: | 
|  | prop = _properties[pname] | 
|  | aliases = prop[1] | 
|  | if len(aliases) > max_aliases: max_aliases = len(aliases) | 
|  | if not values: continue | 
|  | out_file.write("static const Value VALUES_%s[%d] = {\n" % | 
|  | (pname, len(values))) | 
|  | for (v_enum, vname) in values: | 
|  | aliases = _properties[pname][3][vname] | 
|  | # ccc, lccc, tccc: Omit the numeric strings from the aliases. | 
|  | # (See the comment about ccc in the PropertyValueAliases.txt header.) | 
|  | if pname.endswith("ccc"): aliases = aliases[1:] | 
|  | if len(aliases) > max_aliases: max_aliases = len(aliases) | 
|  | cast = "(int32_t)" if pname == "gcm" else "" | 
|  | out_file.write('    Value(%s%s, "%s"),\n' % | 
|  | (cast, v_enum, " ".join(aliases))) | 
|  | out_file.write("};\n\n") | 
|  |  | 
|  | # For each property, write a Property object initializer | 
|  | # with the property enum, its aliases, and a reference to its values. | 
|  | out_file.write("static const Property PROPERTIES[%d] = {\n" % | 
|  | len(_icu_properties)) | 
|  | for (enum, pname, values) in _icu_properties: | 
|  | prop = _properties[pname] | 
|  | aliases = " ".join(prop[1]) | 
|  | if prop[0] == "Binary": | 
|  | out_file.write('    Property(%s, "%s"),\n' % (enum, aliases)) | 
|  | elif values:  # Property with named values. | 
|  | out_file.write('    Property(%s, "%s", VALUES_%s, %d),\n' % | 
|  | (enum, aliases, pname, len(values))) | 
|  | else: | 
|  | out_file.write('    Property(%s, "%s"),\n' % (enum, aliases)) | 
|  | out_file.write("};\n\n") | 
|  |  | 
|  | out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases) | 
|  |  | 
|  | # main() ------------------------------------------------------------------- *** | 
|  |  | 
|  | def main(): | 
|  | global _null_or_defaults | 
|  | only_ppucd = False | 
|  | if len(sys.argv) == 3: | 
|  | (ucd_root, icu_src_root) = sys.argv[1:3] | 
|  | ppucd_path = None | 
|  | elif len(sys.argv) == 4 and sys.argv[2] == "--only_ppucd": | 
|  | # For debugging: | 
|  | # preparseucd.py  path/to/UCD/root  --only_ppucd  path/to/ppucd/outputfile | 
|  | ucd_root = sys.argv[1] | 
|  | ppucd_path = sys.argv[3] | 
|  | only_ppucd = True | 
|  | icu_src_root = "/tmp/ppucd" | 
|  | else: | 
|  | print ("Usage: %s  path/to/UCD/root  path/to/ICU/src/root" % sys.argv[0]) | 
|  | return | 
|  | icu4c_src_root = os.path.join(icu_src_root, "icu4c") | 
|  | icu_tools_root = os.path.join(icu_src_root, "tools") | 
|  | source_files = [] | 
|  | for root, dirs, files in os.walk(ucd_root): | 
|  | for file in files: | 
|  | source_files.append(os.path.join(root, file)) | 
|  | PreprocessFiles(source_files, icu4c_src_root) | 
|  | # Parse the processed files in a particular order. | 
|  | for files in _files_to_parse: | 
|  | for (basename, path, parser) in files: | 
|  | print "Parsing %s" % basename | 
|  | value = _files[basename] | 
|  | # Unicode data files are in UTF-8. | 
|  | charset = "UTF-8" | 
|  | if basename == "NamesList.txt": | 
|  | # The NamesList used to be in Latin-1 before Unicode 6.2. | 
|  | numeric_ucd_version = [int(field) for field in _ucd_version.split('.')] | 
|  | if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1" | 
|  | in_file = codecs.open(path, "r", charset) | 
|  | with in_file: | 
|  | parser(in_file) | 
|  | _null_or_defaults = _null_values.copy() | 
|  | _null_or_defaults.update(_defaults) | 
|  | # Every Catalog and Enumerated property must have a default value, | 
|  | # from a @missing line. "nv" = "null value". | 
|  | pnv = [pname for (pname, nv) in _null_or_defaults.iteritems() if nv == "??"] | 
|  | if pnv: | 
|  | raise Exception("no default values (@missing lines) for " + | 
|  | "some Catalog or Enumerated properties: %s " % pnv) | 
|  | unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata") | 
|  | if not only_ppucd: | 
|  | # Write Normalizer2 input text files. | 
|  | # Do this before compacting the data so that we need not handle fallbacks. | 
|  | norm2_path = os.path.join(unidata_path, "norm2") | 
|  | if not os.path.exists(norm2_path): os.makedirs(norm2_path) | 
|  | WriteNorm2(norm2_path) | 
|  | # Optimize block vs. cp properties. | 
|  | CompactBlocks() | 
|  | # Write the ppucd.txt output file. | 
|  | # Use US-ASCII so that ICU tests can parse it in the platform charset, | 
|  | # which may be EBCDIC. | 
|  | # Fix up non-ASCII data (NamesList.txt headings) to fit. | 
|  | if not ppucd_path: | 
|  | ppucd_path = os.path.join(unidata_path, "ppucd.txt") | 
|  | with codecs.open(ppucd_path, "w", "US-ASCII") as out_file: | 
|  | WritePreparsedUCD(out_file) | 
|  | out_file.flush() | 
|  |  | 
|  | # TODO: PrintNameStats() | 
|  |  | 
|  | if only_ppucd: return | 
|  |  | 
|  | # ICU data for property & value names API | 
|  | ParseUCharHeader(icu4c_src_root) | 
|  | ParseUScriptHeader(icu4c_src_root) | 
|  | CheckPNamesData() | 
|  | genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops") | 
|  | if not os.path.exists(genprops_path): os.makedirs(genprops_path) | 
|  | out_path = os.path.join(genprops_path, "pnames_data.h") | 
|  | WritePNamesDataHeader(out_path) | 
|  |  | 
|  |  | 
|  | if __name__ == "__main__": | 
|  | main() |