src/com/ibm/icu/dev/tool/translit/indic.pl - external/github.com/unicode-org/icu - Git at Google

 #/**
 # *******************************************************************************
 # * Copyright (C) 2000-2004, International Business Machines Corporation and    *
 # * others. All Rights Reserved.                                                *
 # *******************************************************************************
 # */

 #!perl

 # Usage - $0 <remap file>
 #  e.g. - indic indic.txt
 # The input file should be a subset of the Unicode data file containing
 # the blocks of interest.
 #
 # The remap file should have lines of the form
 # "\u0D01>\u0D02;"
 # including the quotes.  These will be interpreted as saying that the
 # undefined code point U+D01 (derived via mapping from InterIndic)
 # can be remapped to U+D02.
 #
 # The purpose of this script is to process the Indic script data into
 # a form usable by the IndicTransliterator, that is, the Indic-Indic
 # transliterator.  The transliterator needs two things: A mapping of
 # the code points in common, and a list of the exceptions.

 # Assume we are located in icu4j/src/com/ibm/tools/translit/.
 # We want the Unicode DB in icu4j/src/data/unicode/.
 $UNICODE_DB = "../../../../data/unicode/UnicodeData.txt";
 $EXCEPTIONS_FILE = shift;

 # Assume we are located in icu4j/src/com/ibm/tools/translit/.
 # We want to output files to icu4j/src/com/ibm/text/resources/.
 # Output directory
 $OUTDIR = "../../text/resources";

 # The template file should contain java code that can be used
 # to generate RuleBasedTransliterator resource files.  The template
 # should contain the following embedded symbols, which this script
 # will replace:
 # $TOOL - name of generating tool
 # $DATE - date of generation
 # $SCRIPTFROM - name of source script
 # $SCRIPTTO - name of target script
 # $RULES - rules
 $RBT_TEMPLATE = 'rbtTemplate.txt';

 # Name of this tool in generated RBT files
 $RBT_GEN_TOOL = 'icu4j/src/com/ibm/tools/translit/indic.pl';

 $DUMP = 0; # If 1, dump out internal data

 $DO_HEURISTIC_REMAP = 0; # If 1, do automatic heuristic remapping
 $DO_DECOMP_REMAP = 0; # If 1, do decomp remapping

 open(UNICODE_DB);
 while (<UNICODE_DB>) {
     next if (m|^0[0-8]|); # Skip up to Devanagari block (0900)
     last if (m|^0D[8-F]|i); # Bail out after Malayam block (0D00)
     # 0D39;MALAYALAM LETTER HA;Lo;0;L;;;;;N;;;;;
     my @data = split(/;/);
     my $fullCode = hex($data[0]); # e.g., 0x093F
     my $code = $fullCode & 0x7F; # e.g., 0x3F
     my ($script, $name) = ($data[1] =~ /(\w+)\s+(.+)/);
     die "Can't parse $_" unless ($name);
     # e.g., $code/$script/$name = 3F/MALAYALAM/VOWEL SIGN I

     # Titlecase the script
     $script = ucfirst(lc($script));

     # Fix a couple inconsistencies in the 3.0 data
     # REVISIT: Is this okay to do?
     if ($DO_HEURISTIC_REMAP) {
         if ($script eq 'Gujarati' && $code >= 5 && $code <= 0x14) {
             $name =~ s/^VOWEL/LETTER/;
         }
     }

     # Keep track of all script names we encounter.  We also note the
     # base of the block.
     my $base = $fullCode & ~0x7F; # e.g., 0x900;
     if (exists $SCRIPT_TO_BASE{$script}) {
         die "Script base mismatch for $script: $base vs. $SCRIPT_TO_BASE{$script}"
             if ($SCRIPT_TO_BASE{$script} ne $base);
     } else {
         $SCRIPT_TO_BASE{$script} = $base;
     }

     # Build up a mapping by name.  For each name, keep a hash keyed by
     # code point.  For each code point, keep an array of script names.
     # Also keep a total use count for each name.
     push @{$NAME_CODE_TO_SCRIPTS{$name}{$code}}, $script;
     ++$NAME_CODE_TO_SCRIPTS{$name}{count};

     # Build a map that looks like this:
     # $SCRIPT_NAME_TO_CODE{<script>}{<name>} = <code>
     # or undef if there is no mapping.
     $SCRIPT_NAME_TO_CODE{$script}{$name} = $code;

     # Build a map that looks like this:
     $SCRIPT_CODE_TO_NAME{$script}{$code} = $name;

     # And a map from the fullCode point to the name
     $FULLCODE_TO_NAME{$fullCode} = $name;

     # Map code (0..7F) to name.  This is usually a 1-1 mapping, but
     # is 1-n in a few cases.
     if (exists $CODE_TO_NAME{$code}) {
         if ($name ne $CODE_TO_NAME{$code}) {
             # For multiple names on a code offset, use the format
             # (a/b), (a/b/c), etc.
             local $_ = $CODE_TO_NAME{$code};
             if (m|^\(|) {
                 if (!m|[\(\)/]$name[\(\)/]|) {
                     s|\)$|/$name\)|;
                 }
             } else {
                 $_ = "($_/$name)";
             }
             $CODE_TO_NAME{$code} = $_;
         }
     } else {
         $CODE_TO_NAME{$code} = $name;
     }
 }
 close(UNICODE_DB);

 # Read and parse the manual remapping file.  This contains lines
 # of the form:

 # |"\u0956>\u0948;"  // AI Length Mark -> Devanagari Vowel Sign AI

 # The left hand side contains a non-existent full code value.  It
 # should be a single value.  The right hand side contains one or more
 # real full code values.  The idea is that when a mapping from another
 # script ends up at the non-existent code point on the left, the
 # sequence on the right should be substituted.  In this example,
 # Devanagari has no AI Length Mark.  So, if transliterating from
 # Oriya, then the character 0B56 (Oriya AI Length Mark) will remap to
 # the non-existent 0956, and that remaps to 0948, our chosen
 # Devanagari equivalent.  For our purposes, the left hand side should
 # be taken to mean its equivalent point in the InterIndic range.  In
 # this example, what it really says is E056>0948 in the
 # InterIndic-Devanagari transliterator.

 if ($EXCEPTIONS_FILE) {
     open(EXCEPTIONS_FILE) or die;
     while (<EXCEPTIONS_FILE>) {
         if (m|^\s*\"([^\"]*?)\"|) {
             my $line = $_;
             $_ = $1;
             if (/^(.*)>(.*);$/) {
                 my ($rawFrom, $rawTo) = ($1, $2);
                 my @from = parseUnicodeEscape($rawFrom);
                 my @to = parseUnicodeEscape($rawTo);
                 my $from = hexArray(@from);
                 # Some entries look like this:
                 # |"\u0955>\u0955;"
                 # these do nothing; ignore them.
                 if (intArraysEqual(\@from, \@to)) {
                     #print STDERR "Ignoring NOOP remap of $from\n";
                 } elsif (exists $EXCEPTIONS{$from}) {
                     print STDERR "ERROR in $EXCEPTIONS_FILE - Duplicate remap entries for $from\n";
                 } elsif (scalar @from > 1) {
                     print STDERR "ERROR in $EXCEPTIONS_FILE - Ignoring multichar remap: ", hexArray(@from), "->", hexArray(@to), "\n";
                 } else {
                     # Check this for validity.  Full code on the left
                     # should NOT exist.  Full code seq on the right should.
                     if (exists $FULLCODE_TO_NAME{$from[0]}) {
                         print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; left side defined: ", hexArray(@from), "->", hexArray(@to), "\n";
                     } elsif (grep(! exists $FULLCODE_TO_NAME{$_}, @to)) {
                         print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; right side undefined: ", hexArray(@from), "->", hexArray(@to), "\n";
                     } else {
                         $EXCEPTIONS{$from[0]} = \@to;
                     }
                 }
             } else { die "ERROR in $EXCEPTIONS_FILE - Can't parse \"$_\" in line $line"; }
         }
     }
     close(EXCEPTIONS_FILE);
     print STDERR "$EXCEPTIONS_FILE: Loaded ", scalar keys %EXCEPTIONS, " remappings\n";
 }

 if ($DO_DECOMP_REMAP) {
     # Read the NamesList.txt file.  This contains decomposition data.
     # Gather these into %DECOMP, which maps a name to n1.n2..., where n1
     # etc. are decomposed names.  E.g. $DECOMP{'LETTER RRA'} -> 'LETTER
     # RA.SIGN NUKTA'.  There may be different mappings in different script
     # blocks (LETTER RRA is mapped differently in Devanagari and Bengali),
     # in which case the name goes into %DECOMP_MISMATCH, and is removed
     # from %DECOMP.
     $NAMES = "NamesList.txt";
     open(NAMES);
     while (<NAMES>) {
         # Skip to start of DEVANAGARI block
         last if (/^\@\@\s+0900/);
     }
     while (<NAMES>) {
         # Continue until start of SINHALA block
         last if (/^\@\@\s+0D80/);
         if (/^([0-9A-Z]{4})/i) {
             $code = $1;
         } elsif (/^\s+:\s*(.+)/) {
             # We've found a mapping of the form:
             # 0929    DEVANAGARI LETTER NNNA
             #     * for transcribing Dravidian alveolar n
             #     : 0928 093C
             my $from = $FULLCODE_TO_NAME{hex($code)};
             my @to = map($FULLCODE_TO_NAME{hex($_)}, split(/\s+/, $1));
             if (exists $DECOMP{$from}) {
                 my $aref = $DECOMP{$from};
                 if (join(".", @$aref) ne join(".", @to)) {
                     print STDERR "ERROR: Decomp mismatch for $from\n";
                     print STDERR "     : $from = ", join(".", @$aref), "\n";
                     print STDERR "     : $from = ", join(".", @to), "\n";
                     $DECOMP_MISMATCH{$from} = 1;
                 }
             } else {
                 $DECOMP{$from} = \@to;
             }
         }
     }
     close(NAMES);
     # Remove mismatches
     foreach (keys %DECOMP_MISMATCH) {
         delete $DECOMP{$_};
     }
     if ($DUMP) {
         foreach (keys %DECOMP) {
             print "$_ = ", join(" + ", @{$DECOMP{$_}}), "\n";
         }
     }
 }

 # Count the total number of scripts

 $SCRIPT_COUNT = scalar keys %SCRIPT_TO_BASE;
 #print join("\n", sort keys %SCRIPT_TO_BASE), "\n";

 # Dump out the %NAME_CODE_TO_SCRIPTS map.

 if ($DUMP) {
     print "\nBY NAME:\n";
     foreach my $pass ((1, 2)) {
         print "\nBY NAME - SINGLETONS:\n" if ($pass eq 2);
         foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
             if ($pass eq 1) {
                 next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
             } else {
                 next if (1 < $NAME_CODE_TO_SCRIPTS{$name}{count});
             }
             print "$name:";
             my $href = $NAME_CODE_TO_SCRIPTS{$name};
             foreach my $code (sort {$a <=> $b} keys %$href) {
                 next if ($code eq 'count');
                 my $aref = $href->{$code};
                 print " ", hex2($code), " (", formatScriptList($aref), ")";
             }
             print "\n";
         }
     }
 }

 # Create some transliterators, based on the scripts and the %NAME_CODE_TO_SCRIPTS
 # map.  Only use %NAME_CODE_TO_SCRIPTS entries with a count of 2 or more, that is,
 # names that occur in two or more scripts.  For those scripts where
 # the names occur, map both up to the InterIndic range, and down to
 # the target script.

 $INTERINDIC = 0xE000;
 $INTERINDIC_EXTRA = 0xE080;
 $INTERINDIC_EXTRA_NEXT = $INTERINDIC_EXTRA;

 # For each script, create a hash.  The hash has a key for each
 # code point, either within its block, or in the InterIndic block.
 # the value of the key is the mapping.

 # The script hashes are named %DEVANAGARI, etc., and referenced
 # with symbolic refs.

 @REMAP = ('s/\bSHORT\s+//i',
           's/\bCANDRA\s+//i',
           's/\bQA$/KA/i',
           's/\bKHHA$/KHA/i',
           's/\bGHHA$/GA/i',
           's/\bZA$/JA/i',
           's/\bFA$/PHA/i',
           's/\bVA$/BA/i',
           's/\bNNNA$/NA/i',
           's/\bRRA$/RA/i',
           's/\bLLLA$/LLA/i',
           's/\bLLLA$/LA/i',
           's/\bLLA$/LA/i',
           's/^A(.) LENGTH MARK$/VOWEL SIGN A$1/i',
           's/CANDRABINDU/BINDI/i',
           's/BINDI/CANDRABINDU/i',
           );

 # Do this so we see zero counts:
 foreach my $remap (@REMAP) { $REMAP{$remap} = 0; }

 # This loop iterates over the names in the NAME_CODE_TO_SCRIPTS hash.
 # These names are things like "LETTER NNNA".  For each name, it then
 # creates script mappings up to the InterIndic area, and back down
 # to the script areas.  If a name maps to more than one offset,
 # then it uses the InterIndic extra range.  Either way, it picks
 # a single InterIndic point, either an offset point or something in
 # the extra range, and maps up and down from that point.
 foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
     next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
     my $href = $NAME_CODE_TO_SCRIPTS{$name};
     # Count the number of different codes assigned to this name.
     # Usually 1, but 2 for a handful of names.
     my $codeCount = (keys %{$NAME_CODE_TO_SCRIPTS{$name}}) - 1; # less 1: {count}
     # If $codeCount is 1, then map directly up to the $INTERINDIC
     # base.  If $codeCount is 2, then map into unused spots starting
     # at $INTERINDIC_EXTRA.
     my $interIndicCode;
     if ($codeCount > 1) {
         # Map into the InterIndic extra range
         $interIndicCode = $INTERINDIC_EXTRA_NEXT++;
     }
     my %seen;
     foreach my $code (sort {$a ne 'count' && $b ne 'count' && $a <=> $b} keys %$href) {
         next if ($code eq 'count');
         my $aref = $href->{$code}; # Ref to array of scripts
         if ($codeCount == 1) {
             # Map directly
             $interIndicCode = $INTERINDIC + $code;
         }
         # Keep track of the names of the extra InterIndic points
         $INTERINDIC_NAME_TO_FULLCODE{$name} = $interIndicCode;

         foreach my $scr (@$aref) {
             $seen{$scr} = 1;
             my $fullCode = $SCRIPT_TO_BASE{$scr} + $code;
             $ {$scr}{$fullCode} = hex4($interIndicCode) . "; // $name";
             $ {$scr}{$interIndicCode} = hex4($fullCode) . "; // $name";
         }
     }
     # Now handle InterIndic->Script unmapped points.  For each name,
     # some of the scripts will be left out -- will have no mappings
     # to that name.  For these scripts, we can either leave them
     # unmapped (so the InterIndic->Local mapping is empty), or
     # try to remap.
  unmappedScript:
     foreach my $scr (keys %SCRIPT_TO_BASE) {
         next if ($seen{$scr});

         if ($DO_HEURISTIC_REMAP) {
             # Try to remap through the known equivalences in our
             # remapping table
             foreach my $remapRE (@REMAP) {
                 local $_ = $name;
                 if (eval($remapRE)) {
                     if (exists $SCRIPT_NAME_TO_CODE{$scr}{$_}) {
                         $ {$scr}{$interIndicCode} =
                             hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$_}) .
                                 "; // REMAP: $name -> $_";
                         ++$REMAP{$remapRE};
                         next unmappedScript;
                     }
                 }
             }
         }

         # Try to remap through the file.  This contains remappings of
         # the form 0991->0993.  That is, it contains local remappings
         # that we can substitute and try again with.
         #|GURMUKHI-InterIndic ------------------------------
         #|// 0A02>; // UNMAPPED INTERNAL: SIGN BINDI
         #|InterIndic-GURMUKHI ------------------------------
         #|// E001>; // UNMAPPED EXTERNAL: SIGN CANDRABINDU
         #|"\u0A01>\u0A02;"
         # In this example, the remapping tells us that the non-existent
         # character A01 should be considered equivalent to the real
         # character A02.
         # We implement this by adding two mappings; one from
         # the InterIndic equivalent of A01, that is, E001, to A02,
         # and one from A02, which otherwise has no mapping, to E001.
         if ($EXCEPTIONS_FILE && $interIndicCode < $INTERINDIC_EXTRA) {
             # Try to map this InterIndic character back to a the spot
             # it would occupy in this script if it had a mapping.
             my $code = $interIndicCode & 0x7F;
             my $pseudoFullCode = $SCRIPT_TO_BASE{$scr} + $code;
             if (exists $EXCEPTIONS{$pseudoFullCode}) {
                 my $fullCodeArray = $EXCEPTIONS{$pseudoFullCode};
                 my $comment;
                 foreach my $c (@$fullCodeArray) {
                     $comment .= "." if ($comment);
                     $comment .= $FULLCODE_TO_NAME{$c};
                 }
                 $comment = "; // REMAP ($EXCEPTIONS_FILE): " .
                     hex4($pseudoFullCode) . ">" . hexArray(@$fullCodeArray) . " = " .
                     $CODE_TO_NAME{$code} . ">" . $comment;
                 $ {$scr}{$interIndicCode} = hexArray(@$fullCodeArray) . $comment;
                 if (scalar @$fullCodeArray == 1) {
                     if (exists $ {$scr}{$fullCodeArray->[0]}) {
                         # There's already a proper mapping; no need to fill
                         # in reverse
                     } else {
                         $ {$scr}{$fullCodeArray->[0]} = hex4($interIndicCode) . $comment;
                     }
                 }
                 next unmappedScript;
             }
         }

         $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$interIndicCode} = 1;
         local $_ = "; // UNMAPPED InterIndic-$scr: $name";
         if (exists $SCRIPT_CODE_TO_NAME{$scr}{$interIndicCode & 0x7F}) {
             my $fullCode = $SCRIPT_TO_BASE{$scr} + ($interIndicCode & 0x7F);
             $_ .= " (" . hex4($fullCode) . " = " . $FULLCODE_TO_NAME{$fullCode} . ")";
         }
         $ {$scr}{$interIndicCode} = $_;
     }
 }

 # Add in unmapped entries for each script
 foreach my $scr (keys %SCRIPT_TO_BASE) {
     my $base = $SCRIPT_TO_BASE{$scr};
  unmappedInt:
     foreach my $code (keys %{$SCRIPT_CODE_TO_NAME{$scr}}) {
         my $fullCode = $code + $base;
         next if (exists $ {$scr}{$fullCode});
         my $name = $SCRIPT_CODE_TO_NAME{$scr}{$code};

         if ($DO_HEURISTIC_REMAP) {
             foreach my $remapRE (@REMAP) {
                 local $_ = $name;
                 if (eval($remapRE)) {
                     if (exists $INTERINDIC_NAME_TO_FULLCODE{$_}) {
                         $ {$scr}{$fullCode} =
                             hex4($INTERINDIC_NAME_TO_FULLCODE{$_}) .
                                 "; // REMAP: $name -> $_";
                         ++$REMAP{$remapRE};
                         next unmappedInt;
                     }
                 }
             }
         }

         # Now try the decomp table
         if ($DO_DECOMP_REMAP && exists $DECOMP{$name}) {
             my $x;
             my $cmt = "; // DECOMP: $name -> ";
             foreach my $n (@{$DECOMP{$name}}) {
                 if (exists $SCRIPT_NAME_TO_CODE{$scr}{$n}) {
                     $x .= hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$n});
                     $cmt .= $n . " + ";
                 } else {
                     $cmt = 0;
                     last;
                 }
             }
             if ($cmt) {
                 $ {$scr}{$fullCode} = $x . $cmt;
                 next unmappedInt;
             }
         }

         $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode} = 1;
         $ {$scr}{$fullCode} = "; // UNMAPPED $scr-InterIndic: $name";
     }
 }

 # GUR
 # E00B>; // UNMAPPED EXTERNAL: LETTER VOCALIC R "\u0A0B>\u0A30\u0A3F;"
 # E00C>; // UNMAPPED EXTERNAL: LETTER VOCALIC L "\u0A0C>\u0A07;"
 # E00D>; // UNMAPPED EXTERNAL: LETTER CANDRA E "\u0A0D>\u0A10;"
 # E011>; // UNMAPPED EXTERNAL: LETTER CANDRA O "\u0A11>\u0A14;"
 # E037>; // UNMAPPED EXTERNAL: LETTER SSA "\u0A37>\u0A36;"
 # E045>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA E "\u0A45>\u0A48;"
 # E049>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA O "\u0A49>\u0A4C;"
 # Fix QA too

 # Dump out script maps
 foreach my $scr (sort keys %SCRIPT_TO_BASE) {
     ## next unless ($scr eq 'TELUGU'); # Debugging
     my @rules;
     my $flag = 1;
     foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
         if ($flag && $fullCode >= $INTERINDIC) {
             # We have the complete <scr>-InterIndic rules; dump
             # them out.
             generateRBT($scr, "InterIndic", \@rules, $OUTDIR);
             @rules = ();
             $flag = 0;
         }
         if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
             push @rules, "// " . hex4($fullCode) . ">" . $ {$scr}{$fullCode};
         } else {
             push @rules, hex4($fullCode) . ">" . $ {$scr}{$fullCode};
         }
     }
     # Now generate the InterIndic-<scr> rules.
     generateRBT("InterIndic", $scr, \@rules, $OUTDIR);

 #    print "$scr-InterIndic ------------------------------\n";
 #    my $flag = 1;
 #    foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
 #        if ($flag && $fullCode >= $INTERINDIC) {
 #            print "InterIndic-$scr ------------------------------\n";
 #            $flag = 0;
 #        }
 #        if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
 #            print "// ", hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
 #        } else {
 #            print hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
 #        }
 #    }
 }

 # List successful remappings
 if ($DO_HEURISTIC_REMAP) {
     foreach my $remap (sort keys %REMAP) {
         print STDERR "REMAP ", $REMAP{$remap}, " x $remap\n";
     }
 }

 #----------------------------------------------------------------------
 # SUBROUTINES

 # Return a listing of an array of scripts
 # Param: array ref
 sub formatScriptList {
     my $aref = shift;
     if ($SCRIPT_COUNT == @$aref) {
         return "all";
     } elsif (($SCRIPT_COUNT - 3) <= @$aref) {
         my $s = "all but";
         my %temp;
         foreach (@$aref) { $temp{$_} = 1; }
         foreach (sort keys %SCRIPT_TO_BASE) {
             $s .= " $_" unless exists $temp{$_};
         }
         return $s;
     } else {
         return join(" ", @$aref);
     }
 }

 # Format as %02X hex
 sub hex2 {
     sprintf("%02X", $_[0]);
 }

 # Format as %04X hex
 sub hex4 {
     sprintf("\\u%04X", $_[0]);
 }

 # Format an array as %04X hex, delimited by "."s
 sub hexArray {
     join("", map { hex4($_); } @_);
 }

 # Parse a string of the form "\u0D01" to an array of integers.
 # Must ONLY contain escapes.
 # Return the array.
 sub parseUnicodeEscape {
     local $_ = shift;
     my $orig = $_;
     my @result;
     while (length($_)) {
         if (/^\\u([0-9a-f]{4})(.*)/i) {
             push @result, hex($1);
             $_ = $2;
         } else {
             die "Can't parse Unicode escape $orig\n";
         }
     }
     if (0 == @result) {
         die "Can't parse Unicode escape $orig\n";
     }
     @result;
 }

 # Return 1 if the two arrays of ints are equal.
 # Param: ref to array of ints
 # Param: ref to array of ints
 sub intArraysEqual {
     my $a = shift;
     my $b = shift;
     if (scalar @$a == scalar @$b) {
         for (my $i=0; $i<@$a; ++$i) {
             if ($a->[$i] != $b->[$i]) {
                 return 0;
             }
         }
         return 1;
     }
     return 0;
 }

 # Given a rule, possibly with trailing // comment,
 # quote the rule part and add a trailing "+" after
 # it.
 sub quoteRule {
     my $cmt;
     $cmt = $1 if (s|(\s*//.*)||); # isolate trailing // comment
     s/^(.*;)/\"$1\"+/;
     s/$/$cmt/;
     $_;
 }

 # Given the name of the source script, name of the target script,
 # and array of rule strings, return a string containing the source
 # for a RuleBasedTransliterator file.
 # Param: source script name
 # Param: target script name
 # Param: ref to array of rules.  These rules are unquoted, without
 #  concatenators between them, but do have trailing ';' separators.
 # Param: name of output directory
 sub generateRBT {
     # $TOOL - name of generating tool
     # $DATE - date of generation
     # $SCRIPTFROM - name of source script
     # $SCRIPTTO - name of target script
     # $RULES - rules
     my ($source, $target, $rules, $outdir) = @_;
     my $text;
     $outdir =~ s|[/\\]$||; # Delete trailing / or \
     my $OUT = "$outdir/TransliterationRule_${source}_$target.java";
     open(RBT_TEMPLATE) or die;
     open(OUT, ">$OUT") or die;
     while (<RBT_TEMPLATE>) {
         while (/\$([A-Za-z0-9]+)/) {
             my $tag = $1;
             my $sub;
             if ($tag eq 'TOOL') {
                 $sub = $RBT_GEN_TOOL;
             } elsif ($tag eq 'DATE') {
                 $sub = localtime;
             } elsif ($tag eq 'SCRIPTFROM') {
                 $sub = $source;
             } elsif ($tag eq 'SCRIPTTO') {
                 $sub = $target;
             } elsif ($tag eq 'RULES') {
                 # Get any whitespace-only indent off the front of this tag
                 my $indent;
                 $indent = $1 if (/^(\s+)\$$tag/);

                 # The rules in the array are not quoted.  We need to quote
                 # them and add '+' operators between them.  We do NOT need
                 # to add ';' separators.  We DO need to separate trailing
                 # // comments and handle them.
                 $sub = join("\n$indent", map(&quoteRule, @$rules)) .
                     "\n$indent\"\"";
             } else {
                 print STDERR "ERROR in $RBT_TEMPLATE: Unknown tag $tag\n";
                 $sub = "[ERROR:Unknown tag \$$tag]";
             }
             s/\$$tag/$sub/;
         }
         print OUT;
     }
     close(OUT);
     close(RBT_TEMPLATE);
     print STDERR "Written: $OUT\n";
 }

 __END__
	#/**
	# *******************************************************************************
	# * Copyright (C) 2000-2004, International Business Machines Corporation and *
	# * others. All Rights Reserved. *
	# *******************************************************************************
	# */

	#!perl

	# Usage - $0 <remap file>
	# e.g. - indic indic.txt
	# The input file should be a subset of the Unicode data file containing
	# the blocks of interest.
	#
	# The remap file should have lines of the form
	# "\u0D01>\u0D02;"
	# including the quotes. These will be interpreted as saying that the
	# undefined code point U+D01 (derived via mapping from InterIndic)
	# can be remapped to U+D02.
	#
	# The purpose of this script is to process the Indic script data into
	# a form usable by the IndicTransliterator, that is, the Indic-Indic
	# transliterator. The transliterator needs two things: A mapping of
	# the code points in common, and a list of the exceptions.

	# Assume we are located in icu4j/src/com/ibm/tools/translit/.
	# We want the Unicode DB in icu4j/src/data/unicode/.
	$UNICODE_DB = "../../../../data/unicode/UnicodeData.txt";
	$EXCEPTIONS_FILE = shift;

	# Assume we are located in icu4j/src/com/ibm/tools/translit/.
	# We want to output files to icu4j/src/com/ibm/text/resources/.
	# Output directory
	$OUTDIR = "../../text/resources";

	# The template file should contain java code that can be used
	# to generate RuleBasedTransliterator resource files. The template
	# should contain the following embedded symbols, which this script
	# will replace:
	# $TOOL - name of generating tool
	# $DATE - date of generation
	# $SCRIPTFROM - name of source script
	# $SCRIPTTO - name of target script
	# $RULES - rules
	$RBT_TEMPLATE = 'rbtTemplate.txt';

	# Name of this tool in generated RBT files
	$RBT_GEN_TOOL = 'icu4j/src/com/ibm/tools/translit/indic.pl';

	$DUMP = 0; # If 1, dump out internal data

	$DO_HEURISTIC_REMAP = 0; # If 1, do automatic heuristic remapping
	$DO_DECOMP_REMAP = 0; # If 1, do decomp remapping

	open(UNICODE_DB);
	while (<UNICODE_DB>) {
	next if (m\|^0[0-8]\|); # Skip up to Devanagari block (0900)
	last if (m\|^0D[8-F]\|i); # Bail out after Malayam block (0D00)
	# 0D39;MALAYALAM LETTER HA;Lo;0;L;;;;;N;;;;;
	my @data = split(/;/);
	my $fullCode = hex($data[0]); # e.g., 0x093F
	my $code = $fullCode & 0x7F; # e.g., 0x3F
	my ($script, $name) = ($data[1] =~ /(\w+)\s+(.+)/);
	die "Can't parse $_" unless ($name);
	# e.g., $code/$script/$name = 3F/MALAYALAM/VOWEL SIGN I

	# Titlecase the script
	$script = ucfirst(lc($script));

	# Fix a couple inconsistencies in the 3.0 data
	# REVISIT: Is this okay to do?
	if ($DO_HEURISTIC_REMAP) {
	if ($script eq 'Gujarati' && $code >= 5 && $code <= 0x14) {
	$name =~ s/^VOWEL/LETTER/;
	}
	}

	# Keep track of all script names we encounter. We also note the
	# base of the block.
	my $base = $fullCode & ~0x7F; # e.g., 0x900;
	if (exists $SCRIPT_TO_BASE{$script}) {
	die "Script base mismatch for $script: $base vs. $SCRIPT_TO_BASE{$script}"
	if ($SCRIPT_TO_BASE{$script} ne $base);
	} else {
	$SCRIPT_TO_BASE{$script} = $base;
	}

	# Build up a mapping by name. For each name, keep a hash keyed by
	# code point. For each code point, keep an array of script names.
	# Also keep a total use count for each name.
	push @{$NAME_CODE_TO_SCRIPTS{$name}{$code}}, $script;
	++$NAME_CODE_TO_SCRIPTS{$name}{count};

	# Build a map that looks like this:
	# $SCRIPT_NAME_TO_CODE{<script>}{<name>} = <code>
	# or undef if there is no mapping.
	$SCRIPT_NAME_TO_CODE{$script}{$name} = $code;

	# Build a map that looks like this:
	$SCRIPT_CODE_TO_NAME{$script}{$code} = $name;

	# And a map from the fullCode point to the name
	$FULLCODE_TO_NAME{$fullCode} = $name;

	# Map code (0..7F) to name. This is usually a 1-1 mapping, but
	# is 1-n in a few cases.
	if (exists $CODE_TO_NAME{$code}) {
	if ($name ne $CODE_TO_NAME{$code}) {
	# For multiple names on a code offset, use the format
	# (a/b), (a/b/c), etc.
	local $_ = $CODE_TO_NAME{$code};
	if (m\|^\(\|) {
	if (!m\|[\(\)/]$name[\(\)/]\|) {
	s\|\)$\|/$name\)\|;
	}
	} else {
	$_ = "($_/$name)";
	}
	$CODE_TO_NAME{$code} = $_;
	}
	} else {
	$CODE_TO_NAME{$code} = $name;
	}
	}
	close(UNICODE_DB);

	# Read and parse the manual remapping file. This contains lines
	# of the form:

	# \|"\u0956>\u0948;" // AI Length Mark -> Devanagari Vowel Sign AI

	# The left hand side contains a non-existent full code value. It
	# should be a single value. The right hand side contains one or more
	# real full code values. The idea is that when a mapping from another
	# script ends up at the non-existent code point on the left, the
	# sequence on the right should be substituted. In this example,
	# Devanagari has no AI Length Mark. So, if transliterating from
	# Oriya, then the character 0B56 (Oriya AI Length Mark) will remap to
	# the non-existent 0956, and that remaps to 0948, our chosen
	# Devanagari equivalent. For our purposes, the left hand side should
	# be taken to mean its equivalent point in the InterIndic range. In
	# this example, what it really says is E056>0948 in the
	# InterIndic-Devanagari transliterator.

	if ($EXCEPTIONS_FILE) {
	open(EXCEPTIONS_FILE) or die;
	while (<EXCEPTIONS_FILE>) {
	if (m\|^\s\"([^\"]?)\"\|) {
	my $line = $_;
	$_ = $1;
	if (/^(.)>(.);$/) {
	my ($rawFrom, $rawTo) = ($1, $2);
	my @from = parseUnicodeEscape($rawFrom);
	my @to = parseUnicodeEscape($rawTo);
	my $from = hexArray(@from);
	# Some entries look like this:
	# \|"\u0955>\u0955;"
	# these do nothing; ignore them.
	if (intArraysEqual(\@from, \@to)) {
	#print STDERR "Ignoring NOOP remap of $from\n";
	} elsif (exists $EXCEPTIONS{$from}) {
	print STDERR "ERROR in $EXCEPTIONS_FILE - Duplicate remap entries for $from\n";
	} elsif (scalar @from > 1) {
	print STDERR "ERROR in $EXCEPTIONS_FILE - Ignoring multichar remap: ", hexArray(@from), "->", hexArray(@to), "\n";
	} else {
	# Check this for validity. Full code on the left
	# should NOT exist. Full code seq on the right should.
	if (exists $FULLCODE_TO_NAME{$from[0]}) {
	print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; left side defined: ", hexArray(@from), "->", hexArray(@to), "\n";
	} elsif (grep(! exists $FULLCODE_TO_NAME{$_}, @to)) {
	print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; right side undefined: ", hexArray(@from), "->", hexArray(@to), "\n";
	} else {
	$EXCEPTIONS{$from[0]} = \@to;
	}
	}
	} else { die "ERROR in $EXCEPTIONS_FILE - Can't parse \"$_\" in line $line"; }
	}
	}
	close(EXCEPTIONS_FILE);
	print STDERR "$EXCEPTIONS_FILE: Loaded ", scalar keys %EXCEPTIONS, " remappings\n";
	}

	if ($DO_DECOMP_REMAP) {
	# Read the NamesList.txt file. This contains decomposition data.
	# Gather these into %DECOMP, which maps a name to n1.n2..., where n1
	# etc. are decomposed names. E.g. $DECOMP{'LETTER RRA'} -> 'LETTER
	# RA.SIGN NUKTA'. There may be different mappings in different script
	# blocks (LETTER RRA is mapped differently in Devanagari and Bengali),
	# in which case the name goes into %DECOMP_MISMATCH, and is removed
	# from %DECOMP.
	$NAMES = "NamesList.txt";
	open(NAMES);
	while (<NAMES>) {
	# Skip to start of DEVANAGARI block
	last if (/^\@\@\s+0900/);
	}
	while (<NAMES>) {
	# Continue until start of SINHALA block
	last if (/^\@\@\s+0D80/);
	if (/^([0-9A-Z]{4})/i) {
	$code = $1;
	} elsif (/^\s+:\s*(.+)/) {
	# We've found a mapping of the form:
	# 0929 DEVANAGARI LETTER NNNA
	# * for transcribing Dravidian alveolar n
	# : 0928 093C
	my $from = $FULLCODE_TO_NAME{hex($code)};
	my @to = map($FULLCODE_TO_NAME{hex($_)}, split(/\s+/, $1));
	if (exists $DECOMP{$from}) {
	my $aref = $DECOMP{$from};
	if (join(".", @$aref) ne join(".", @to)) {
	print STDERR "ERROR: Decomp mismatch for $from\n";
	print STDERR " : $from = ", join(".", @$aref), "\n";
	print STDERR " : $from = ", join(".", @to), "\n";
	$DECOMP_MISMATCH{$from} = 1;
	}
	} else {
	$DECOMP{$from} = \@to;
	}
	}
	}
	close(NAMES);
	# Remove mismatches
	foreach (keys %DECOMP_MISMATCH) {
	delete $DECOMP{$_};
	}
	if ($DUMP) {
	foreach (keys %DECOMP) {
	print "$_ = ", join(" + ", @{$DECOMP{$_}}), "\n";
	}
	}
	}

	# Count the total number of scripts

	$SCRIPT_COUNT = scalar keys %SCRIPT_TO_BASE;
	#print join("\n", sort keys %SCRIPT_TO_BASE), "\n";

	# Dump out the %NAME_CODE_TO_SCRIPTS map.

	if ($DUMP) {
	print "\nBY NAME:\n";
	foreach my $pass ((1, 2)) {
	print "\nBY NAME - SINGLETONS:\n" if ($pass eq 2);
	foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
	if ($pass eq 1) {
	next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
	} else {
	next if (1 < $NAME_CODE_TO_SCRIPTS{$name}{count});
	}
	print "$name:";
	my $href = $NAME_CODE_TO_SCRIPTS{$name};
	foreach my $code (sort {$a <=> $b} keys %$href) {
	next if ($code eq 'count');
	my $aref = $href->{$code};
	print " ", hex2($code), " (", formatScriptList($aref), ")";
	}
	print "\n";
	}
	}
	}

	# Create some transliterators, based on the scripts and the %NAME_CODE_TO_SCRIPTS
	# map. Only use %NAME_CODE_TO_SCRIPTS entries with a count of 2 or more, that is,
	# names that occur in two or more scripts. For those scripts where
	# the names occur, map both up to the InterIndic range, and down to
	# the target script.

	$INTERINDIC = 0xE000;
	$INTERINDIC_EXTRA = 0xE080;
	$INTERINDIC_EXTRA_NEXT = $INTERINDIC_EXTRA;

	# For each script, create a hash. The hash has a key for each
	# code point, either within its block, or in the InterIndic block.
	# the value of the key is the mapping.

	# The script hashes are named %DEVANAGARI, etc., and referenced
	# with symbolic refs.

	@REMAP = ('s/\bSHORT\s+//i',
	's/\bCANDRA\s+//i',
	's/\bQA$/KA/i',
	's/\bKHHA$/KHA/i',
	's/\bGHHA$/GA/i',
	's/\bZA$/JA/i',
	's/\bFA$/PHA/i',
	's/\bVA$/BA/i',
	's/\bNNNA$/NA/i',
	's/\bRRA$/RA/i',
	's/\bLLLA$/LLA/i',
	's/\bLLLA$/LA/i',
	's/\bLLA$/LA/i',
	's/^A(.) LENGTH MARK$/VOWEL SIGN A$1/i',
	's/CANDRABINDU/BINDI/i',
	's/BINDI/CANDRABINDU/i',
	);

	# Do this so we see zero counts:
	foreach my $remap (@REMAP) { $REMAP{$remap} = 0; }

	# This loop iterates over the names in the NAME_CODE_TO_SCRIPTS hash.
	# These names are things like "LETTER NNNA". For each name, it then
	# creates script mappings up to the InterIndic area, and back down
	# to the script areas. If a name maps to more than one offset,
	# then it uses the InterIndic extra range. Either way, it picks
	# a single InterIndic point, either an offset point or something in
	# the extra range, and maps up and down from that point.
	foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
	next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
	my $href = $NAME_CODE_TO_SCRIPTS{$name};
	# Count the number of different codes assigned to this name.
	# Usually 1, but 2 for a handful of names.
	my $codeCount = (keys %{$NAME_CODE_TO_SCRIPTS{$name}}) - 1; # less 1: {count}
	# If $codeCount is 1, then map directly up to the $INTERINDIC
	# base. If $codeCount is 2, then map into unused spots starting
	# at $INTERINDIC_EXTRA.
	my $interIndicCode;
	if ($codeCount > 1) {
	# Map into the InterIndic extra range
	$interIndicCode = $INTERINDIC_EXTRA_NEXT++;
	}
	my %seen;
	foreach my $code (sort {$a ne 'count' && $b ne 'count' && $a <=> $b} keys %$href) {
	next if ($code eq 'count');
	my $aref = $href->{$code}; # Ref to array of scripts
	if ($codeCount == 1) {
	# Map directly
	$interIndicCode = $INTERINDIC + $code;
	}
	# Keep track of the names of the extra InterIndic points
	$INTERINDIC_NAME_TO_FULLCODE{$name} = $interIndicCode;

	foreach my $scr (@$aref) {
	$seen{$scr} = 1;
	my $fullCode = $SCRIPT_TO_BASE{$scr} + $code;
	$ {$scr}{$fullCode} = hex4($interIndicCode) . "; // $name";
	$ {$scr}{$interIndicCode} = hex4($fullCode) . "; // $name";
	}
	}
	# Now handle InterIndic->Script unmapped points. For each name,
	# some of the scripts will be left out -- will have no mappings
	# to that name. For these scripts, we can either leave them
	# unmapped (so the InterIndic->Local mapping is empty), or
	# try to remap.
	unmappedScript:
	foreach my $scr (keys %SCRIPT_TO_BASE) {
	next if ($seen{$scr});

	if ($DO_HEURISTIC_REMAP) {
	# Try to remap through the known equivalences in our
	# remapping table
	foreach my $remapRE (@REMAP) {
	local $_ = $name;
	if (eval($remapRE)) {
	if (exists $SCRIPT_NAME_TO_CODE{$scr}{$_}) {
	$ {$scr}{$interIndicCode} =
	hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$_}) .
	"; // REMAP: $name -> $_";
	++$REMAP{$remapRE};
	next unmappedScript;
	}
	}
	}
	}

	# Try to remap through the file. This contains remappings of
	# the form 0991->0993. That is, it contains local remappings
	# that we can substitute and try again with.
	#\|GURMUKHI-InterIndic ------------------------------
	#\|// 0A02>; // UNMAPPED INTERNAL: SIGN BINDI
	#\|InterIndic-GURMUKHI ------------------------------
	#\|// E001>; // UNMAPPED EXTERNAL: SIGN CANDRABINDU
	#\|"\u0A01>\u0A02;"
	# In this example, the remapping tells us that the non-existent
	# character A01 should be considered equivalent to the real
	# character A02.
	# We implement this by adding two mappings; one from
	# the InterIndic equivalent of A01, that is, E001, to A02,
	# and one from A02, which otherwise has no mapping, to E001.
	if ($EXCEPTIONS_FILE && $interIndicCode < $INTERINDIC_EXTRA) {
	# Try to map this InterIndic character back to a the spot
	# it would occupy in this script if it had a mapping.
	my $code = $interIndicCode & 0x7F;
	my $pseudoFullCode = $SCRIPT_TO_BASE{$scr} + $code;
	if (exists $EXCEPTIONS{$pseudoFullCode}) {
	my $fullCodeArray = $EXCEPTIONS{$pseudoFullCode};
	my $comment;
	foreach my $c (@$fullCodeArray) {
	$comment .= "." if ($comment);
	$comment .= $FULLCODE_TO_NAME{$c};
	}
	$comment = "; // REMAP ($EXCEPTIONS_FILE): " .
	hex4($pseudoFullCode) . ">" . hexArray(@$fullCodeArray) . " = " .
	$CODE_TO_NAME{$code} . ">" . $comment;
	$ {$scr}{$interIndicCode} = hexArray(@$fullCodeArray) . $comment;
	if (scalar @$fullCodeArray == 1) {
	if (exists $ {$scr}{$fullCodeArray->[0]}) {
	# There's already a proper mapping; no need to fill
	# in reverse
	} else {
	$ {$scr}{$fullCodeArray->[0]} = hex4($interIndicCode) . $comment;
	}
	}
	next unmappedScript;
	}
	}

	$SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$interIndicCode} = 1;
	local $_ = "; // UNMAPPED InterIndic-$scr: $name";
	if (exists $SCRIPT_CODE_TO_NAME{$scr}{$interIndicCode & 0x7F}) {
	my $fullCode = $SCRIPT_TO_BASE{$scr} + ($interIndicCode & 0x7F);
	$_ .= " (" . hex4($fullCode) . " = " . $FULLCODE_TO_NAME{$fullCode} . ")";
	}
	$ {$scr}{$interIndicCode} = $_;
	}
	}

	# Add in unmapped entries for each script
	foreach my $scr (keys %SCRIPT_TO_BASE) {
	my $base = $SCRIPT_TO_BASE{$scr};
	unmappedInt:
	foreach my $code (keys %{$SCRIPT_CODE_TO_NAME{$scr}}) {
	my $fullCode = $code + $base;
	next if (exists $ {$scr}{$fullCode});
	my $name = $SCRIPT_CODE_TO_NAME{$scr}{$code};

	if ($DO_HEURISTIC_REMAP) {
	foreach my $remapRE (@REMAP) {
	local $_ = $name;
	if (eval($remapRE)) {
	if (exists $INTERINDIC_NAME_TO_FULLCODE{$_}) {
	$ {$scr}{$fullCode} =
	hex4($INTERINDIC_NAME_TO_FULLCODE{$_}) .
	"; // REMAP: $name -> $_";
	++$REMAP{$remapRE};
	next unmappedInt;
	}
	}
	}
	}

	# Now try the decomp table
	if ($DO_DECOMP_REMAP && exists $DECOMP{$name}) {
	my $x;
	my $cmt = "; // DECOMP: $name -> ";
	foreach my $n (@{$DECOMP{$name}}) {
	if (exists $SCRIPT_NAME_TO_CODE{$scr}{$n}) {
	$x .= hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$n});
	$cmt .= $n . " + ";
	} else {
	$cmt = 0;
	last;
	}
	}
	if ($cmt) {
	$ {$scr}{$fullCode} = $x . $cmt;
	next unmappedInt;
	}
	}

	$SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode} = 1;
	$ {$scr}{$fullCode} = "; // UNMAPPED $scr-InterIndic: $name";
	}
	}

	# GUR
	# E00B>; // UNMAPPED EXTERNAL: LETTER VOCALIC R "\u0A0B>\u0A30\u0A3F;"
	# E00C>; // UNMAPPED EXTERNAL: LETTER VOCALIC L "\u0A0C>\u0A07;"
	# E00D>; // UNMAPPED EXTERNAL: LETTER CANDRA E "\u0A0D>\u0A10;"
	# E011>; // UNMAPPED EXTERNAL: LETTER CANDRA O "\u0A11>\u0A14;"
	# E037>; // UNMAPPED EXTERNAL: LETTER SSA "\u0A37>\u0A36;"
	# E045>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA E "\u0A45>\u0A48;"
	# E049>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA O "\u0A49>\u0A4C;"
	# Fix QA too

	# Dump out script maps
	foreach my $scr (sort keys %SCRIPT_TO_BASE) {
	## next unless ($scr eq 'TELUGU'); # Debugging
	my @rules;
	my $flag = 1;
	foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
	if ($flag && $fullCode >= $INTERINDIC) {
	# We have the complete <scr>-InterIndic rules; dump
	# them out.
	generateRBT($scr, "InterIndic", \@rules, $OUTDIR);
	@rules = ();
	$flag = 0;
	}
	if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
	push @rules, "// " . hex4($fullCode) . ">" . $ {$scr}{$fullCode};
	} else {
	push @rules, hex4($fullCode) . ">" . $ {$scr}{$fullCode};
	}
	}
	# Now generate the InterIndic-<scr> rules.
	generateRBT("InterIndic", $scr, \@rules, $OUTDIR);

	# print "$scr-InterIndic ------------------------------\n";
	# my $flag = 1;
	# foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
	# if ($flag && $fullCode >= $INTERINDIC) {
	# print "InterIndic-$scr ------------------------------\n";
	# $flag = 0;
	# }
	# if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
	# print "// ", hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
	# } else {
	# print hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
	# }
	# }
	}

	# List successful remappings
	if ($DO_HEURISTIC_REMAP) {
	foreach my $remap (sort keys %REMAP) {
	print STDERR "REMAP ", $REMAP{$remap}, " x $remap\n";
	}
	}

	#----------------------------------------------------------------------
	# SUBROUTINES

	# Return a listing of an array of scripts
	# Param: array ref
	sub formatScriptList {
	my $aref = shift;
	if ($SCRIPT_COUNT == @$aref) {
	return "all";
	} elsif (($SCRIPT_COUNT - 3) <= @$aref) {
	my $s = "all but";
	my %temp;
	foreach (@$aref) { $temp{$_} = 1; }
	foreach (sort keys %SCRIPT_TO_BASE) {
	$s .= " $_" unless exists $temp{$_};
	}
	return $s;
	} else {
	return join(" ", @$aref);
	}
	}

	# Format as %02X hex
	sub hex2 {
	sprintf("%02X", $_[0]);
	}

	# Format as %04X hex
	sub hex4 {
	sprintf("\\u%04X", $_[0]);
	}

	# Format an array as %04X hex, delimited by "."s
	sub hexArray {
	join("", map { hex4($_); } @_);
	}

	# Parse a string of the form "\u0D01" to an array of integers.
	# Must ONLY contain escapes.
	# Return the array.
	sub parseUnicodeEscape {
	local $_ = shift;
	my $orig = $_;
	my @result;
	while (length($_)) {
	if (/^\\u([0-9a-f]{4})(.*)/i) {
	push @result, hex($1);
	$_ = $2;
	} else {
	die "Can't parse Unicode escape $orig\n";
	}
	}
	if (0 == @result) {
	die "Can't parse Unicode escape $orig\n";
	}
	@result;
	}

	# Return 1 if the two arrays of ints are equal.
	# Param: ref to array of ints
	# Param: ref to array of ints
	sub intArraysEqual {
	my $a = shift;
	my $b = shift;
	if (scalar @$a == scalar @$b) {
	for (my $i=0; $i<@$a; ++$i) {
	if ($a->[$i] != $b->[$i]) {
	return 0;
	}
	}
	return 1;
	}
	return 0;
	}

	# Given a rule, possibly with trailing // comment,
	# quote the rule part and add a trailing "+" after
	# it.
	sub quoteRule {
	my $cmt;
	$cmt = $1 if (s\|(\s//.)\|\|); # isolate trailing // comment
	s/^(.*;)/\"$1\"+/;
	s/$/$cmt/;
	$_;
	}

	# Given the name of the source script, name of the target script,
	# and array of rule strings, return a string containing the source
	# for a RuleBasedTransliterator file.
	# Param: source script name
	# Param: target script name
	# Param: ref to array of rules. These rules are unquoted, without
	# concatenators between them, but do have trailing ';' separators.
	# Param: name of output directory
	sub generateRBT {
	# $TOOL - name of generating tool
	# $DATE - date of generation
	# $SCRIPTFROM - name of source script
	# $SCRIPTTO - name of target script
	# $RULES - rules
	my ($source, $target, $rules, $outdir) = @_;
	my $text;
	$outdir =~ s\|[/\\]$\|\|; # Delete trailing / or \
	my $OUT = "$outdir/TransliterationRule_${source}_$target.java";
	open(RBT_TEMPLATE) or die;
	open(OUT, ">$OUT") or die;
	while (<RBT_TEMPLATE>) {
	while (/\$([A-Za-z0-9]+)/) {
	my $tag = $1;
	my $sub;
	if ($tag eq 'TOOL') {
	$sub = $RBT_GEN_TOOL;
	} elsif ($tag eq 'DATE') {
	$sub = localtime;
	} elsif ($tag eq 'SCRIPTFROM') {
	$sub = $source;
	} elsif ($tag eq 'SCRIPTTO') {
	$sub = $target;
	} elsif ($tag eq 'RULES') {
	# Get any whitespace-only indent off the front of this tag
	my $indent;
	$indent = $1 if (/^(\s+)\$$tag/);

	# The rules in the array are not quoted. We need to quote
	# them and add '+' operators between them. We do NOT need
	# to add ';' separators. We DO need to separate trailing
	# // comments and handle them.
	$sub = join("\n$indent", map(&quoteRule, @$rules)) .
	"\n$indent\"\"";
	} else {
	print STDERR "ERROR in $RBT_TEMPLATE: Unknown tag $tag\n";
	$sub = "[ERROR:Unknown tag \$$tag]";
	}
	s/\$$tag/$sub/;
	}
	print OUT;
	}
	close(OUT);
	close(RBT_TEMPLATE);
	print STDERR "Written: $OUT\n";
	}

	__END__