source/tools/genpname/preparse.pl - external/github.com/unicode-org/icu - Git at Google

 #!/bin/perl -w
 #*******************************************************************
 # COPYRIGHT:
 # Copyright (c) 2002-2005, International Business Machines Corporation and
 # others. All Rights Reserved.
 #*******************************************************************

 # This script reads in UCD files PropertyAliases.txt and
 # PropertyValueAliases.txt and correlates them with ICU enums
 # defined in uchar.h and uscript.h.  It then outputs a header
 # file which contains all names and enums.  The header is included
 # by the genpname tool C++ source file, which produces the actual
 # binary data file.
 #
 # See usage note below.
 #
 # TODO: The Property[Value]Alias.txt files state that they can support
 # more than 2 names per property|value.  Currently (Unicode 3.2) there
 # are always 1 or 2 names.  If more names were supported, presumably
 # the format would be something like:
 #    nv        ; Numeric_Value
 #    nv        ; Value_Numerique
 # CURRENTLY, this script assumes that there are 1 or two names.  Any
 # duplicates it sees are flagged as an error.  If multiple aliases
 # appear in a future version of Unicode, modify this script to support
 # that.
 #
 # NOTE: As of ICU 2.6, this script has been modified to know about the
 # pseudo-property gcm/General_Category_Mask, which corresponds to the
 # uchar.h property UCHAR_GENERAL_CATEGORY_MASK.  This property
 # corresponds to General_Category but is a bitmask value.  It does not
 # exist in the UCD.  Therefore, I special case it in several places
 # (search for General_Category_Mask and gcm).
 #
 # NOTE: As of ICU 2.6, this script reads an auxiliary data file,
 # SyntheticPropertyAliases.txt, containing property aliases not
 # present in the UCD but present in ICU.  This file resides in the
 # same directory as this script.  Its contents are merged into those
 # of PropertyAliases.txt as if the two files were appended.
 #
 # NOTE: The following names are handled specially.  See script below
 # for details.
 #
 #   T/True
 #   F/False
 #   No_Block
 #
 # Author: Alan Liu
 # Created: October 14 2002
 # Since: ICU 2.4

 use FileHandle;
 use strict;
 use Dumpvalue;

 my $DEBUG = 1;
 my $DUMPER = new Dumpvalue;

 my $count = @ARGV;
 my $ICU_DIR = shift() || '';
 my $OUT_FILE = shift() || 'data.h';
 my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
 my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";

 # Get the current year from the system
 my $YEAR = 1900+@{[localtime]}[5]; # Get the current year

 # Used to make "n/a" property aliases (Unicode or Synthetic) unique
 my $propNA = 0;

 #----------------------------------------------------------------------
 # Top level property keys for binary, enumerated, string, and double props
 my @TOP     = qw( _bp _ep _sp _dp _mp );

 # This hash governs how top level properties are grouped into output arrays.
 #my %TOP_PROPS = ( "VALUED"   => [ '_bp', '_ep' ],
 #                  "NO_VALUE" => [ '_sp', '_dp' ] );m
 #my %TOP_PROPS = ( "BINARY"   => [ '_bp' ],
 #                  "ENUMERATED" => [ '_ep' ],
 #                  "STRING" => [ '_sp' ],
 #                  "DOUBLE" => [ '_dp' ] );
 my %TOP_PROPS = ( ""   => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );

 my %PROP_TYPE = (Binary => "_bp",
                  String => "_sp",
                  Double => "_dp",
                  Enumerated => "_ep",
                  Bitmask => "_mp");
 #----------------------------------------------------------------------

 # Properties that are unsupported in ICU
 my %UNSUPPORTED = (Composition_Exclusion => 1,
                    Decomposition_Mapping => 1,
                    Expands_On_NFC => 1,
                    Expands_On_NFD => 1,
                    Expands_On_NFKC => 1,
                    Expands_On_NFKD => 1,
                    FC_NFKC_Closure => 1,
                    ID_Start_Exceptions => 1,
                    Special_Case_Condition => 1,
                    );

 # Short names of properties that weren't seen in uchar.h.  If the
 # properties weren't seen, don't complain about the property values
 # missing.
 my %MISSING_FROM_UCHAR;

 # Additional property aliases beyond short and long names,
 # like space in addition to WSpace and White_Space in Unicode 4.1.
 # Hashtable, maps long name to alias.
 # For example, maps White_Space->space.
 #
 # If multiple additional aliases are defined,
 # then they are separated in the value string with '|'.
 # For example, White_Space->space|outer_space
 my %additional_property_aliases;

 #----------------------------------------------------------------------

 # Emitted class names
 my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);

 if ($count < 1 || $count > 2 ||
     !-d $HEADER_DIR ||
     !-d $UNIDATA_DIR) {
     my $me = $0;
     $me =~ s|.+[/\\]||;
     my $lm = ' ' x length($me);
     print <<"END";

 $me: Reads ICU4C headers and Unicode data files and creates
 $lm  a C header file that is included by genpname.  The header
 $lm  file matches constants defined in the ICU4C headers with
 $lm  property|value aliases in the Unicode data files.

 Usage: $me <icu_dir> [<out_file>]

 <icu_dir>   ICU4C root directory, containing
                source/common/unicode/uchar.h
                source/common/unicode/uscript.h
                source/data/unidata/Blocks.txt
                source/data/unidata/PropertyAliases.txt
                source/data/unidata/PropertyValueAliases.txt
 <out_file>  File name of header to be written;
             default is 'data.h'.

 The Unicode versions of all input files must match.
 END
     exit(1);
 }

 my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);

 if ($DEBUG) {
     print "Merged hash:\n";
     for my $key (sort keys %$h) {
         my $hh = $h->{$key};
         for my $subkey (sort keys %$hh) {
             print "$key:$subkey:", $hh->{$subkey}, "\n";
         }
     }
 }

 my $out = new FileHandle($OUT_FILE, 'w');
 die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
 my $save = select($out);
 formatData($h, $version);
 select($save);
 $out->close();

 exit(0);

 #----------------------------------------------------------------------
 # From PropList.html: "The properties of the form Other_XXX
 # are used to generate properties in DerivedCoreProperties.txt.
 # They are not intended for general use, such as in APIs that
 # return property values.
 # Non_Break is not a valid property as of 3.2.
 sub isIgnoredProperty {
     local $_ = shift;
     /^Other_/i || /^Non_Break$/i;
 }

 # 'qc' is a pseudo-property matching any quick-check property
 # see PropertyValueAliases.txt file comments.  'binprop' is
 # a synthetic binary value alias "True"/"False", not present
 # in PropertyValueAliases.txt.
 sub isPseudoProperty {
     $_[0] eq 'qc' ||
         $_[0] eq 'binprop';
 }

 #----------------------------------------------------------------------
 # Emit the combined data from headers and the Unicode database as a
 # C source code header file.
 #
 # @param ref to hash with the data
 # @param Unicode version, as a string
 sub formatData {
     my $h = shift;
     my $version = shift;

     my $date = scalar localtime();
     print <<"END";
 /**
  * Copyright (C) 2002-$YEAR, International Business Machines Corporation and
  * others. All Rights Reserved.
  *
  * MACHINE GENERATED FILE.  !!! Do not edit manually !!!
  *
  * Generated from
  *   uchar.h
  *   uscript.h
  *   Blocks.txt
  *   PropertyAliases.txt
  *   PropertyValueAliases.txt
  *
  * Date: $date
  * Unicode version: $version
  * Script: $0
  */

 END

     #------------------------------------------------------------
     # Emit Unicode version
     print "/* Unicode version $version */\n";
     my @v = split(/\./, $version);
     push @v, '0' while (@v < 4);
     for (my $i=0; $i<@v; ++$i) {
         print "const uint8_t VERSION_$i = $v[$i];\n";
     }
     print "\n";

     #------------------------------------------------------------
     # Emit String table
     # [A table of all identifiers, that is, all long or short property
     # or value names.  The list need NOT be sorted; it will be sorted
     # by the C program.  Strings are referenced by their index into
     # this table.  After sorting, a REMAP[] array is used to map the
     # old position indices to the new positions.]
     my %strings;
     for my $prop (sort keys %$h) {
         my $hh = $h->{$prop};
         for my $enum (sort keys %$hh) {
             my @a = split(/\|/, $hh->{$enum});
             for (@a) {
                 $strings{$_} = 1 if (length($_));
             }
         }
     }
     my @strings = sort keys %strings;
     unshift @strings, "";

     print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";

     # while printing, create a mapping hash from string table entry to index
     my %stringToID;
     print "/* to be sorted */\n";
     print "const $STRING_CLASS STRING_TABLE[] = {\n";
     for (my $i=0; $i<@strings; ++$i) {
         print "    $STRING_CLASS(\"$strings[$i]\", $i),\n";
         $stringToID{$strings[$i]} = $i;
     }
     print "};\n\n";

     # placeholder for the remapping index.  this is used to map
     # indices that we compute here to indices of the sorted
     # STRING_TABLE.  STRING_TABLE will be sorted by the C++ program
     # using the uprv_comparePropertyNames() function.  this will
     # reshuffle the order.  we then use the indices (passed to the
     # String constructor) to create a REMAP[] array.
     print "/* to be filled in */\n";
     print "int32_t REMAP[", scalar @strings, "];\n\n";

     #------------------------------------------------------------
     # Emit the name group table
     # [A table of name groups.  A name group is one or more names
     # for a property or property value.  The Unicode data files specify
     # that there may be more than 2, although as of Unicode 3.2 there
     # are at most 2.  The name group table looks like this:
     #
     #  114, -115, 116, -117, 0, -118, 65, -64, ...
     #  [0]        [2]        [4]      [6]
     #
     # The entry at [0] consists of 2 strings, 114 and 115.
     # The entry at [2] consists of 116 and 117.  The entry at
     # [4] is one string, 118.  There is always at least one
     # string; typically there are two.  If there are two, the first
     # is the SHORT name and the second is the LONG.  If there is
     # one, then the missing entry (always the short name, in 3.2)
     # is zero, which is by definition the index of "".  The
     # 'preferred' name will generally be the LONG name, if there are
     # more than 2 entries.  The last entry is negative.

     # Build name group list and replace string refs with nameGroup indices
     my @nameGroups;

     # Check for duplicate name groups, and reuse them if possible
     my %groupToInt; # Map group strings to ints
     for my $prop (sort keys %$h) {
         my $hh = $h->{$prop};
         for my $enum (sort keys %$hh) {
             my $groupString = $hh->{$enum};
             my $i;
             if (exists $groupToInt{$groupString}) {
                 $i = $groupToInt{$groupString};
             } else {
                 my @names = split(/\|/, $groupString);
                 die "Error: Wrong number of names in " . $groupString if (@names < 1);
                 $i = @nameGroups; # index of group we are making
                 $groupToInt{$groupString} = $i; # Cache for reuse
                 push @nameGroups, map { $stringToID{$_} } @names;
                 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
             }
             # now, replace string list with ref to name group
             $hh->{$enum} = $i;
         }
     }

     print "const int32_t NAME_GROUP_COUNT = ",
           scalar @nameGroups, ";\n\n";

     print "int32_t NAME_GROUP[] = {\n";
     # emit one group per line, with annotations
     my $max_names = 0;
     for (my $i=0; $i<@nameGroups; ) {
         my @a;
         my $line;
         my $start = $i;
         for (;;) {
             my $j = $nameGroups[$i++];
             $line .= "$j, ";
             push @a, abs($j);
             last if ($j < 0);
         }
         print "    ",
               $line,
               ' 'x(20-length($line)),
               "/* ", sprintf("%3d", $start),
               ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
         $max_names = @a if(@a > $max_names);

     }
     print "};\n\n";

     # This is fixed for 3.2 at "2" but should be calculated dynamically
     # when more than 2 names appear in Property[Value]Aliases.txt.
     print "#define MAX_NAMES_PER_GROUP $max_names\n\n";

     #------------------------------------------------------------
     # Emit enumerated property values
     for my $prop (sort keys %$h) {
         next if ($prop =~ /^_/);
         my $vh = $h->{$prop};
         my $count = scalar keys %$vh;

         print "const int32_t VALUES_${prop}_COUNT = ",
               $count, ";\n\n";

         print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
         for my $enum (sort keys %$vh) {
             #my @names = split(/\|/, $vh->{$enum});
             #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
             #    if (@names != 2);
             print "    $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
                   #$stringToID{$names[0]}, ", ",
                   #$stringToID{$names[1]}, "),\n";
             #      "\"", $names[0], "\", ",
             #      "\"", $names[1], "\"),\n";
         }
         print "};\n\n";
     }

     #------------------------------------------------------------
     # Emit top-level properties (binary, enumerated, etc.)
     for my $topName (sort keys %TOP_PROPS) {
         my $a = $TOP_PROPS{$topName};
         my $count = 0;
         for my $type (@$a) { # "_bp", "_ep", etc.
             $count += scalar keys %{$h->{$type}};
         }

         print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";

         print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";

         for my $type (@$a) { # "_bp", "_ep", etc.
             my $p = $h->{$type};

             for my $enum (sort keys %$p) {
                 my $name = $strings[$nameGroups[$p->{$enum}]];

                 my $valueRef = "0, NULL";
                 if ($type eq '_bp') {
                     $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
                 }
                 elsif (exists $h->{$name}) {
                     $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
                 }

                 print "    $PROPERTY_CLASS((int32_t) $enum, ",
                       $p->{$enum}, ", $valueRef),\n";
             }
         }
         print "};\n\n";
     }

     print "/*eof*/\n";
 }

 #----------------------------------------------------------------------
 # Read in the files uchar.h, uscript.h, Blocks.txt,
 # PropertyAliases.txt, and PropertyValueAliases.txt,
 # and combine them into one hash.
 #
 # @param directory containing headers
 # @param directory containin Unicode data files
 #
 # @return hash ref, Unicode version
 sub readAndMerge {

     my ($headerDir, $unidataDir) = @_;

     my $h = read_uchar("$headerDir/uchar.h");
     my $s = read_uscript("$headerDir/uscript.h");
     my $b = read_Blocks("$unidataDir/Blocks.txt");
     my $pa = {};
     read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
     read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
     my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt");

     # Extract property family hash
     my $fam = $pa->{'_family'};
     delete $pa->{'_family'};

     # Note: uscript.h has no version string, so don't check it
     my $version = check_versions([ 'uchar.h', $h ],
                                  [ 'Blocks.txt', $b ],
                                  [ 'PropertyAliases.txt', $pa ],
                                  [ 'PropertyValueAliases.txt', $va ]);

     # Do this BEFORE merging; merging modifies the hashes
     check_PropertyValueAliases($pa, $va);

     # Dump out the $va hash for debugging
     if ($DEBUG) {
         print "Property values hash:\n";
         for my $key (sort keys %$va) {
             my $hh = $va->{$key};
             for my $subkey (sort keys %$hh) {
                 print "$key:$subkey:", $hh->{$subkey}, "\n";
             }
         }
     }

     # Dump out the $s hash for debugging
     if ($DEBUG) {
         print "Script hash:\n";
         for my $key (sort keys %$s) {
             print "$key:", $s->{$key}, "\n";
         }
     }

     # Link in the script data
     $h->{'sc'} = $s;

     merge_Blocks($h, $b);

     merge_PropertyAliases($h, $pa, $fam);

     merge_PropertyValueAliases($h, $va);

     ($h, $version);
 }

 #----------------------------------------------------------------------
 # Ensure that the version strings in the given hashes (under the key
 # '_version') are compatible.  Currently this means they must be
 # identical, with the exception that "X.Y" will match "X.Y.0".
 # All hashes must define the key '_version'.
 #
 # @param a list of pairs of (file name, hash reference)
 #
 # @return the version of all the hashes.  Upon return, the '_version'
 # will be removed from all hashes.
 sub check_versions {
     my $version = '';
     my $msg = '';
     foreach my $a (@_) {
         my $name = $a->[0];
         my $h    = $a->[1];
         die "Error: No version found" unless (exists $h->{'_version'});
         my $v = $h->{'_version'};
         delete $h->{'_version'};

         # append ".0" if necessary, to standardize to X.Y.Z
         $v .= '.0' unless ($v =~ /\.\d+\./);
         $v .= '.0' unless ($v =~ /\.\d+\./);
         $msg .= "$name = $v\n";
         if ($version) {
             die "Error: Mismatched Unicode versions\n$msg"
                 unless ($version eq $v);
         } else {
             $version = $v;
         }
     }
     $version;
 }

 #----------------------------------------------------------------------
 # Make sure the property names in PropertyValueAliases.txt match those
 # in PropertyAliases.txt.
 #
 # @param a hash ref from read_PropertyAliases.
 # @param a hash ref from read_PropertyValueAliases.
 sub check_PropertyValueAliases {
     my ($pa, $va) = @_;

     # make a reverse hash of short->long
     my %rev;
     for (keys %$pa) { $rev{$pa->{$_}} = $_; }

     for my $prop (keys %$va) {
         if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
             print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
         }
     }
 }

 #----------------------------------------------------------------------
 # Merge blocks data into uchar.h enum data.  In the 'blk' subhash all
 # code point values, as returned from read_uchar, are replaced by
 # block names, as read from Blocks.txt and returned by read_Blocks.
 # The match must be 1-to-1.  If there is any failure of 1-to-1
 # mapping, an error is signaled.  Upon return, the read_Blocks hash
 # is emptied of all contents, except for those that failed to match.
 #
 # The mapping in the 'blk' subhash, after this function returns, is
 # from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
 # pseudo-name, e.g. "Basic Latin".
 #
 # @param a hash ref from read_uchar.
 # @param a hash ref from read_Blocks.
 sub merge_Blocks {
     my ($h, $b) = @_;

     die "Error: No blocks data in uchar.h"
         unless (exists $h->{'blk'});
     my $blk = $h->{'blk'};
     for my $enum (keys %$blk) {
         my $cp = $blk->{$enum};
         if ($cp && !exists $b->{$cp}) {
             die "Error: No block found at $cp in Blocks.txt";
         }
         # Convert code point to pseudo-name:
         $blk->{$enum} = $b->{$cp};
         delete $b->{$cp};
     }
     my $err = '';
     for my $cp (keys %$b) {
         $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
     }
     die $err if ($err);
 }

 #----------------------------------------------------------------------
 # Merge property alias names into the uchar.h hash.  The subhashes
 # under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
 # examined and the values of those subhashes are assumed to be long
 # names in PropertyAliases.txt.  They are validated and replaced by
 # "<short>|<long>".  Upon return, the read_PropertyAliases hash is
 # emptied of all contents, except for those that failed to match.
 # Unmatched names in PropertyAliases are listed as a warning but do
 # NOT cause the script to die.
 #
 # @param a hash ref from read_uchar.
 # @param a hash ref from read_PropertyAliases.
 # @param a hash mapping long names to property family (e.g., 'binary')
 sub merge_PropertyAliases {
     my ($h, $pa, $fam) = @_;

     for my $k (@TOP) {
         die "Error: No properties data for $k in uchar.h"
             unless (exists $h->{$k});
     }

     for my $subh (map { $h->{$_} } @TOP) {
         for my $enum (keys %$subh) {
             my $long_name = $subh->{$enum};
             if (!exists $pa->{$long_name}) {
                 die "Error: Property $long_name not found (or used more than once)";
             }

             my $value;
             if($pa->{$long_name} =~ m|^n/a\d*$|) {
                 # replace an "n/a" short name with an empty name (nothing before "|");
                 # don't remove it (don't remove the "|"): there must always be a long name,
                 # and if the short name is removed, then the long name becomes the
                 # short name and there is no long name left (unless there is another alias)
                 $value = "|" . $long_name;
             } else {
                 $value = $pa->{$long_name} . "|" . $long_name;
             }
             if (exists $additional_property_aliases{$long_name}) {
                 $value .= "|" . $additional_property_aliases{$long_name};
             }
             $subh->{$enum} = $value;
             delete $pa->{$long_name};
         }
     }

     my @err;
     for my $name (keys %$pa) {
         $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
         if (exists $UNSUPPORTED{$name}) {
             push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
         } elsif (!isIgnoredProperty($name)) {
             push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
         }
     }
     print join("\n", sort @err), "\n" if (@err);
 }

 #----------------------------------------------------------------------
 # Return 1 if two names match ignoring whitespace, '-', and '_'.
 # Used to match names in Blocks.txt with those in PropertyValueAliases.txt
 # as of Unicode 4.0.
 sub matchesLoosely {
     my ($a, $b) = @_;
     $a =~ s/[\s\-_]//g;
     $b =~ s/[\s\-_]//g;
     $a =~ /^$b$/i;
 }

 #----------------------------------------------------------------------
 # Merge PropertyValueAliases.txt data into the uchar.h hash.  All
 # properties other than blk, _bp, and _ep are analyzed and mapped to
 # the names listed in PropertyValueAliases.  They are then replaced
 # with a string of the form "<short>|<long>".  The short or long name
 # may be missing.
 #
 # @param a hash ref from read_uchar.
 # @param a hash ref from read_PropertyValueAliases.
 sub merge_PropertyValueAliases {
     my ($h, $va) = @_;

     my %gcCount;
     for my $prop (keys %$h) {
         # _bp, _ep handled in merge_PropertyAliases
         next if ($prop =~ /^_/);

         # Special case: gcm
         my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;

         # find corresponding PropertyValueAliases data
         die "Error: Can't find $prop in PropertyValueAliases.txt"
             unless (exists $va->{$prop2});
         my $pva = $va->{$prop2};

         # match up data
         my $hh = $h->{$prop};
         for my $enum (keys %$hh) {

             my $name = $hh->{$enum};

             # look up both long and short & ignore case
             my $n;
             if (exists $pva->{$name}) {
                 $n = $name;
             } else {
                 # iterate (slow)
                 for my $a (keys %$pva) {
                     # case-insensitive match
                     # & case-insensitive reverse match
                     if ($a =~ /^$name$/i ||
                         $pva->{$a} =~ /^$name$/i) {
                         $n = $a;
                         last;
                     }
                 }
             }

             # For blocks, do a loose match from Blocks.txt pseudo-name
             # to PropertyValueAliases long name.
             if (!$n && $prop eq 'blk') {
                 for my $a (keys %$pva) {
                     # The block is only going to match the long name,
                     # but we check both for completeness.  As of Unicode
                     # 4.0, blocks do not have short names.
                     if (matchesLoosely($name, $pva->{$a}) ||
                         matchesLoosely($name, $a)) {
                         $n = $a;
                         last;
                     }
                 }
             }

             die "Error: Property value $prop:$name not found" unless ($n);

             my $l = $n;
             my $r = $pva->{$n};
             # convert |n/a\d*| to blank
             $l = '' if ($l =~ m|^n/a\d*$|);
             $r = '' if ($r =~ m|^n/a\d*$|);

             $hh->{$enum} = "$l|$r";
             # Don't delete the 'gc' properties because we need to share
             # them between 'gc' and 'gcm'.  Count each use instead.
             if ($prop2 eq 'gc') {
                 ++$gcCount{$n};
             } else {
                 delete $pva->{$n};
             }
         }
     }

     # Merge the combining class values in manually
     # Add the same values to the synthetic lccc and tccc properties
     die "Error: No ccc data"
         unless exists $va->{'ccc'};
     for my $ccc (keys %{$va->{'ccc'}}) {
         die "Error: Can't overwrite ccc $ccc"
             if (exists $h->{'ccc'}->{$ccc});
         $h->{'lccc'}->{$ccc} =
         $h->{'tccc'}->{$ccc} =
         $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
     }
     delete $va->{'ccc'};

     # Merge synthetic binary property values in manually.
     # These are the "True" and "False" value aliases.
     die "Error: No True/False value aliases"
         unless exists $va->{'binprop'};
     for my $bp (keys %{$va->{'binprop'}}) {
         $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
     }
     delete $va->{'binprop'};

     my $err = '';
     for my $prop (sort keys %$va) {
         my $hh = $va->{$prop};
         for my $subkey (sort keys %$hh) {
             # 'gc' props are shared with 'gcm'; make sure they were used
             # once or twice.
             if ($prop eq 'gc') {
                 my $n = $gcCount{$subkey};
                 next if ($n >= 1 && $n <= 2);
             }
             $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
                 unless exists $MISSING_FROM_UCHAR{$prop};
         }
     }
     print $err if ($err);
 }

 #----------------------------------------------------------------------
 # Read the PropertyAliases.txt file.  Return a hash that maps the long
 # name to the short name.  The special key '_version' will map to the
 # Unicode version of the file.  The special key '_family' holds a
 # subhash that maps long names to a family string, for descriptive
 # purposes.
 #
 # @param a filename for PropertyAliases.txt
 # @param reference to hash to receive data.  Keys are long names.
 # Values are short names.
 sub read_PropertyAliases {

     my $hash = shift;         # result

     my $filename = shift;

     my $fam = {};  # map long names to family string
     $fam = $hash->{'_family'} if (exists $hash->{'_family'});

     my $family; # binary, enumerated, etc.

     my $in = new FileHandle($filename, 'r');
     die "Error: Cannot open $filename" if (!defined $in);

     while (<$in>) {

         # Read version (embedded in a comment)
         if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
             die "Error: Multiple versions in $filename"
                 if (exists $hash->{'_version'});
             $hash->{'_version'} = $1;
         }

         # Read family heading
         if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
             $family = $1;
         }

         # Ignore comments and blank lines
         s/\#.*//;
         next unless (/\S/);

         if (/^\s*(.+?)\s*;/) {
             my $short = $1;
             my @fields = /;\s*([^\s;]+)/g;
             if (@fields < 1 || @fields > 2) {
                 my $number = @fields;
                 die "Error: Wrong number of fields ($number) in $filename at $_";
             }

             # Make "n/a" strings unique
             if ($short eq 'n/a') {
                 $short .= sprintf("%03d", $propNA++);
             }
             my $long = $fields[0];
             if ($long eq 'n/a') {
                 $long .= sprintf("%03d", $propNA++);
             }

             # Add long name->short name to the hash=pa hash table
             if (exists $hash->{$long}) {
                 die "Error: Duplicate property $long in $filename"
             }
             $hash->{$long} = $short;
             $fam->{$long} = $family;

             # Add the list of further aliases to the additional_property_aliases hash table,
             # using the long property name as the key.
             # For example:
             #   White_Space->space|outer_space
             if (@fields > 1) {
                 my $value = pop @fields;
                 while (@fields > 1) {
                     $value .= "|" . pop @fields;
                 }
                 $additional_property_aliases{$long} = $value;
             }
         } else {
             die "Error: Can't parse $_ in $filename";
         }
     }

     $in->close();

     $hash->{'_family'} = $fam;
 }

 #----------------------------------------------------------------------
 # Read the PropertyValueAliases.txt file.  Return a two level hash
 # that maps property_short_name:value_short_name:value_long_name.  In
 # the case of the 'ccc' property, the short name is the numeric class
 # and the long name is "<short>|<long>".  The special key '_version'
 # will map to the Unicode version of the file.
 #
 # @param a filename for PropertyValueAliases.txt
 #
 # @return a hash reference.
 sub read_PropertyValueAliases {

     my $filename = shift;

     my $hash = {};         # result

     my $in = new FileHandle($filename, 'r');
     die "Error: Cannot open $filename" if (!defined $in);

     my $valueNA = 0; # Used to make "n/a" strings unique

     while (<$in>) {

         # Read version (embedded in a comment)
         if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
             die "Error: Multiple versions in $filename"
                 if (exists $hash->{'_version'});
             $hash->{'_version'} = $1;
         }

         # Ignore comments and blank lines
         s/\#.*//;
         next unless (/\S/);

         if (/^\s*(.+?)\s*;/i) {
             my $prop = $1;
             my @fields = /;\s*([^\s;]+)/g;
             die "Error: Wrong number of fields in $filename"
                 if (@fields < 2 || @fields > 3);
             # Make "n/a" strings unique
             $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
             # Squash extra fields together
             while (@fields > 2) {
                 my $f = pop @fields;
                 $fields[$#fields] .= '|' . $f;
             }
             addDatum($hash, $prop, @fields);
         }

         else {
             die "Error: Can't parse $_ in $filename";
         }
     }

     $in->close();

     # Script Copt=Qaac (Coptic) is a special case.
     # Before the Copt code was defined, the private-use code Qaac was used.
     # Starting with Unicode 4.1, PropertyValueAliases.txt contains
     # Copt as the short name as well as Qaac as an alias.
     # For use with older Unicode data files, we add here a Qaac->Coptic entry.
     # This should not do anything for 4.1-and-later Unicode data files.
     # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
     $hash->{'sc'}->{'Qaac'} = 'Coptic'
         unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});

     # Add T|True and F|False -- these are values we recognize for
     # binary properties (NOT from PropertyValueAliases.txt).  These
     # are of the same form as the 'ccc' value aliases.
     $hash->{'binprop'}->{'0'} = 'F|False';
     $hash->{'binprop'}->{'1'} = 'T|True';

     $hash;
 }

 #----------------------------------------------------------------------
 # Read the Blocks.txt file.  Return a hash that maps the code point
 # range start to the block name.  The special key '_version' will map
 # to the Unicode version of the file.
 #
 # As of Unicode 4.0, the names in the Blocks.txt are no longer the
 # proper names.  The proper names are now listed in PropertyValueAliases.
 # They are similar but not identical.  Furthermore, 4.0 introduces
 # a new block name, No_Block, which is listed only in PropertyValueAliases
 # and not in Blocks.txt.  As a result, we handle blocks as follows:
 #
 # 1. Read Blocks.txt to map code point range start to quasi-block name.
 # 2. Add to Blocks.txt a synthetic No Block code point & name:
 #    X -> No Block
 # 3. Map quasi-names from Blocks.txt (including No Block) to actual
 #    names from PropertyValueAliases.  This occurs in
 #    merge_PropertyValueAliases.
 #
 # @param a filename for Blocks.txt
 #
 # @return a ref to a hash.  Keys are code points, as text, e.g.,
 # "1720".  Values are pseudo-block names, e.g., "Hanunoo".
 sub read_Blocks {

     my $filename = shift;

     my $hash = {};         # result

     my $in = new FileHandle($filename, 'r');
     die "Error: Cannot open $filename" if (!defined $in);

     while (<$in>) {

         # Read version (embedded in a comment)
         if (/Blocks-(\d+\.\d+\.\d+)/i) {
             die "Error: Multiple versions in $filename"
                 if (exists $hash->{'_version'});
             $hash->{'_version'} = $1;
         }

         # Ignore comments and blank lines
         s/\#.*//;
         next unless (/\S/);

         if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) {
             die "Error: Duplicate range $1 in $filename"
                 if (exists $hash->{$1});
             $hash->{$1} = $2;
         }

         else {
             die "Error: Can't parse $_ in $filename";
         }
     }

     $in->close();

     # Add pseudo-name for No Block
     $hash->{'none'} = 'No Block';

     $hash;
 }

 #----------------------------------------------------------------------
 # Read the uscript.h file and compile a mapping of Unicode symbols to
 # icu4c enum values.
 #
 # @param a filename for uscript.h
 #
 # @return a ref to a hash.  The keys of the hash are enum symbols from
 # uscript.h, and the values are script names.
 sub read_uscript {

     my $filename = shift;

     my $mode = '';         # state machine mode and submode
     my $submode = '';

     my $last = '';         # for line folding

     my $hash = {};         # result
     my $key;               # first-level key

     my $in = new FileHandle($filename, 'r');
     die "Error: Cannot open $filename" if (!defined $in);

     while (<$in>) {
         # Fold continued lines together
         if (/^(.*)\\$/) {
             $last = $1;
             next;
         } elsif ($last) {
             $_ = $last . $_;
             $last = '';
         }

         # Exit all modes here
         if ($mode && $mode ne 'DEPRECATED') {
             if (/^\s*\}/) {
                 $mode = '';
                 next;
             }
         }

         # Handle individual modes

         if ($mode eq 'UScriptCode') {
             if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
                 my ($enum, $code) = ($1, $2);
                 die "Error: Duplicate script $enum"
                     if (exists $hash->{$enum});
                 $hash->{$enum} = $code;
             }
         }

         elsif ($mode eq 'DEPRECATED') {
             if (/\s*\#ifdef/) {
                 die "Error: Nested #ifdef";
                 }
             elsif (/\s*\#endif/) {
                 $mode = '';
             }
         }

         elsif (!$mode) {
             if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
                    /^\s*typedef\s+enum\s+(\w+)\s*$/) {
                 $mode = $1;
                 #print "Parsing $mode\n";
             }

             elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
                 $mode = 'DEPRECATED';
             }
         }
     }

     $in->close();

     $hash;
 }

 #----------------------------------------------------------------------
 # Read the uchar.h file and compile a mapping of Unicode symbols to
 # icu4c enum values.
 #
 # @param a filename for uchar.h
 #
 # @return a ref to a hash.  The keys of the hash are '_bp' for binary
 # properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
 # double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
 # 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
 # value aliases.  The values of the hash are subhashes.  The subhashes
 # have a key of the uchar.h enum symbol, and a value of the alias
 # string (as listed in PropertyValueAliases.txt).  NOTE: The alias
 # string is whatever alias uchar.h lists.  This may be either short or
 # long, depending on the specific enum.  NOTE: For blocks ('blk'), the
 # value is a hex code point for the start of the associated block.
 # NOTE: The special key _version will map to the Unicode version of
 # the file.
 sub read_uchar {

     my $filename = shift;

     my $mode = '';         # state machine mode and submode
     my $submode = '';

     my $last = '';         # for line folding

     my $hash = {};         # result
     my $key;               # first-level key

     my $in = new FileHandle($filename, 'r');
     die "Error: Cannot open $filename" if (!defined $in);

     while (<$in>) {
         # Fold continued lines together
         if (/^(.*)\\$/) {
             $last .= $1;
             next;
         } elsif ($last) {
             $_ = $last . $_;
             $last = '';
         }

         # Exit all modes here
         if ($mode && $mode ne 'DEPRECATED') {
             if (/^\s*\}/) {
                 $mode = '';
                 next;
             }
         }

         # Handle individual modes

         if ($mode eq 'UProperty') {
             if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
                 if ($submode) {
                     addDatum($hash, $key, $1, $submode);
                     $submode = '';
                 } else {
                     #print "Warning: Ignoring $1\n";
                 }
             }

             elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
                 die "Error: Unmatched tag $submode" if ($submode);
                 die "Error: Unrecognized UProperty comment: $_"
                     unless (exists $PROP_TYPE{$1});
                 $key = $PROP_TYPE{$1};
                 $submode = $2;
             }
         }

         elsif ($mode eq 'UCharCategory') {
             if (/^\s*(U_\w+)\s*=/) {
                 if ($submode) {
                     addDatum($hash, 'gc', $1, $submode);
                     $submode = '';
                 } else {
                     #print "Warning: Ignoring $1\n";
                 }
             }

             elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
                 die "Error: Unmatched tag $submode" if ($submode);
                 $submode = $1;
             }
         }

         elsif ($mode eq 'UCharDirection') {
             if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
                 if ($submode) {
                     addDatum($hash, $key, $1, $submode);
                     $submode = '';
                 } else {
                     #print "Warning: Ignoring $1\n";
                 }
             }

             elsif (m|/\*\*\s*([A-Z]+)\s|) {
                 die "Error: Unmatched tag $submode" if ($submode);
                 $key = 'bc';
                 $submode = $1;
             }
         }

         elsif ($mode eq 'UBlockCode') {
             if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'blk', $1, $2);
             }
         }

         elsif ($mode eq 'UEastAsianWidth') {
             if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'ea', $1, $2);
             }
         }

         elsif ($mode eq 'UDecompositionType') {
             if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'dt', $1, $2);
             }
         }

         elsif ($mode eq 'UJoiningType') {
             if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'jt', $1, $2);
             }
         }

         elsif ($mode eq 'UJoiningGroup') {
             if (/^\s*(U_JG_(\w+))/) {
                 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
             }
         }

         elsif ($mode eq 'UGraphemeClusterBreak') {
             if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'GCB', $1, $2);
             }
         }

         elsif ($mode eq 'UWordBreakValues') {
             if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'WB', $1, $2);
             }
         }

         elsif ($mode eq 'USentenceBreak') {
             if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'SB', $1, $2);
             }
         }

         elsif ($mode eq 'ULineBreak') {
             if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'lb', $1, $2);
             }
         }

         elsif ($mode eq 'UNumericType') {
             if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'nt', $1, $2);
             }
         }

         elsif ($mode eq 'UHangulSyllableType') {
             if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'hst', $1, $2);
             }
         }

         elsif ($mode eq 'DEPRECATED') {
             if (/\s*\#ifdef/) {
                 die "Error: Nested #ifdef";
                 }
             elsif (/\s*\#endif/) {
                 $mode = '';
             }
         }

         elsif (!$mode) {
             if (/^\s*\#define\s+(\w+)\s+(.+)/) {
                 # #define $left $right
                 my ($left, $right) = ($1, $2);

                 if ($left eq 'U_UNICODE_VERSION') {
                     my $version = $right;
                     $version = $1 if ($version =~ /^\"(.*)\"/);
                     # print "Unicode version: ", $version, "\n";
                     die "Error: Multiple versions in $filename"
                         if (defined $hash->{'_version'});
                     $hash->{'_version'} = $version;
                 }

                 elsif ($left =~ /U_GC_(\w+?)_MASK/) {
                     addDatum($hash, 'gcm', $left, $1);
                 }
             }

             elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
                    /^\s*typedef\s+enum\s+(\w+)\s*$/) {
                 $mode = $1;
                 #print "Parsing $mode\n";
             }

             elsif (/^\s*enum\s+(\w+)\s*\{/ ||
                    /^\s*enum\s+(\w+)\s*$/) {
                 $mode = $1;
                 #print "Parsing $mode\n";
             }

             elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
                 $mode = 'DEPRECATED';
             }
         }
     }

     $in->close();

     # hardcode known values for the normalization quick check properties
     # see unorm.h for the UNormalizationCheckResult enum

     addDatum($hash, 'NFC_QC', 'UNORM_NO',    'N');
     addDatum($hash, 'NFC_QC', 'UNORM_YES',   'Y');
     addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');

     addDatum($hash, 'NFKC_QC', 'UNORM_NO',    'N');
     addDatum($hash, 'NFKC_QC', 'UNORM_YES',   'Y');
     addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');

     # no "maybe" values for NF[K]D

     addDatum($hash, 'NFD_QC', 'UNORM_NO',    'N');
     addDatum($hash, 'NFD_QC', 'UNORM_YES',   'Y');

     addDatum($hash, 'NFKD_QC', 'UNORM_NO',    'N');
     addDatum($hash, 'NFKD_QC', 'UNORM_YES',   'Y');

     $hash;
 }

 #----------------------------------------------------------------------
 # Add a new value to a two-level hash.  That is, given a ref to
 # a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
 sub addDatum {
     my ($h, $k1, $k2, $v) = @_;
     if (exists $h->{$k1}->{$k2}) {
         die "Error: $k1:$k2 already set to " .
             $h->{$k1}->{$k2} . ", cannot set to " . $v;
     }
     $h->{$k1}->{$k2} = $v;
 }

 #eof