| #/** |
| # ******************************************************************************* |
| # * Copyright (C) 2000-2004, International Business Machines Corporation and * |
| # * others. All Rights Reserved. * |
| # ******************************************************************************* |
| # */ |
| |
| #!perl |
| |
| # Usage - $0 <remap file> |
| # e.g. - indic indic.txt |
| # The input file should be a subset of the Unicode data file containing |
| # the blocks of interest. |
| # |
| # The remap file should have lines of the form |
| # "\u0D01>\u0D02;" |
| # including the quotes. These will be interpreted as saying that the |
| # undefined code point U+D01 (derived via mapping from InterIndic) |
| # can be remapped to U+D02. |
| # |
| # The purpose of this script is to process the Indic script data into |
| # a form usable by the IndicTransliterator, that is, the Indic-Indic |
| # transliterator. The transliterator needs two things: A mapping of |
| # the code points in common, and a list of the exceptions. |
| |
| # Assume we are located in icu4j/src/com/ibm/tools/translit/. |
| # We want the Unicode DB in icu4j/src/data/unicode/. |
| $UNICODE_DB = "../../../../data/unicode/UnicodeData.txt"; |
| $EXCEPTIONS_FILE = shift; |
| |
| # Assume we are located in icu4j/src/com/ibm/tools/translit/. |
| # We want to output files to icu4j/src/com/ibm/text/resources/. |
| # Output directory |
| $OUTDIR = "../../text/resources"; |
| |
| # The template file should contain java code that can be used |
| # to generate RuleBasedTransliterator resource files. The template |
| # should contain the following embedded symbols, which this script |
| # will replace: |
| # $TOOL - name of generating tool |
| # $DATE - date of generation |
| # $SCRIPTFROM - name of source script |
| # $SCRIPTTO - name of target script |
| # $RULES - rules |
| $RBT_TEMPLATE = 'rbtTemplate.txt'; |
| |
| # Name of this tool in generated RBT files |
| $RBT_GEN_TOOL = 'icu4j/src/com/ibm/tools/translit/indic.pl'; |
| |
| $DUMP = 0; # If 1, dump out internal data |
| |
| $DO_HEURISTIC_REMAP = 0; # If 1, do automatic heuristic remapping |
| $DO_DECOMP_REMAP = 0; # If 1, do decomp remapping |
| |
| open(UNICODE_DB); |
| while (<UNICODE_DB>) { |
| next if (m|^0[0-8]|); # Skip up to Devanagari block (0900) |
| last if (m|^0D[8-F]|i); # Bail out after Malayam block (0D00) |
| # 0D39;MALAYALAM LETTER HA;Lo;0;L;;;;;N;;;;; |
| my @data = split(/;/); |
| my $fullCode = hex($data[0]); # e.g., 0x093F |
| my $code = $fullCode & 0x7F; # e.g., 0x3F |
| my ($script, $name) = ($data[1] =~ /(\w+)\s+(.+)/); |
| die "Can't parse $_" unless ($name); |
| # e.g., $code/$script/$name = 3F/MALAYALAM/VOWEL SIGN I |
| |
| # Titlecase the script |
| $script = ucfirst(lc($script)); |
| |
| # Fix a couple inconsistencies in the 3.0 data |
| # REVISIT: Is this okay to do? |
| if ($DO_HEURISTIC_REMAP) { |
| if ($script eq 'Gujarati' && $code >= 5 && $code <= 0x14) { |
| $name =~ s/^VOWEL/LETTER/; |
| } |
| } |
| |
| # Keep track of all script names we encounter. We also note the |
| # base of the block. |
| my $base = $fullCode & ~0x7F; # e.g., 0x900; |
| if (exists $SCRIPT_TO_BASE{$script}) { |
| die "Script base mismatch for $script: $base vs. $SCRIPT_TO_BASE{$script}" |
| if ($SCRIPT_TO_BASE{$script} ne $base); |
| } else { |
| $SCRIPT_TO_BASE{$script} = $base; |
| } |
| |
| # Build up a mapping by name. For each name, keep a hash keyed by |
| # code point. For each code point, keep an array of script names. |
| # Also keep a total use count for each name. |
| push @{$NAME_CODE_TO_SCRIPTS{$name}{$code}}, $script; |
| ++$NAME_CODE_TO_SCRIPTS{$name}{count}; |
| |
| # Build a map that looks like this: |
| # $SCRIPT_NAME_TO_CODE{<script>}{<name>} = <code> |
| # or undef if there is no mapping. |
| $SCRIPT_NAME_TO_CODE{$script}{$name} = $code; |
| |
| # Build a map that looks like this: |
| $SCRIPT_CODE_TO_NAME{$script}{$code} = $name; |
| |
| # And a map from the fullCode point to the name |
| $FULLCODE_TO_NAME{$fullCode} = $name; |
| |
| # Map code (0..7F) to name. This is usually a 1-1 mapping, but |
| # is 1-n in a few cases. |
| if (exists $CODE_TO_NAME{$code}) { |
| if ($name ne $CODE_TO_NAME{$code}) { |
| # For multiple names on a code offset, use the format |
| # (a/b), (a/b/c), etc. |
| local $_ = $CODE_TO_NAME{$code}; |
| if (m|^\(|) { |
| if (!m|[\(\)/]$name[\(\)/]|) { |
| s|\)$|/$name\)|; |
| } |
| } else { |
| $_ = "($_/$name)"; |
| } |
| $CODE_TO_NAME{$code} = $_; |
| } |
| } else { |
| $CODE_TO_NAME{$code} = $name; |
| } |
| } |
| close(UNICODE_DB); |
| |
| # Read and parse the manual remapping file. This contains lines |
| # of the form: |
| |
| # |"\u0956>\u0948;" // AI Length Mark -> Devanagari Vowel Sign AI |
| |
| # The left hand side contains a non-existent full code value. It |
| # should be a single value. The right hand side contains one or more |
| # real full code values. The idea is that when a mapping from another |
| # script ends up at the non-existent code point on the left, the |
| # sequence on the right should be substituted. In this example, |
| # Devanagari has no AI Length Mark. So, if transliterating from |
| # Oriya, then the character 0B56 (Oriya AI Length Mark) will remap to |
| # the non-existent 0956, and that remaps to 0948, our chosen |
| # Devanagari equivalent. For our purposes, the left hand side should |
| # be taken to mean its equivalent point in the InterIndic range. In |
| # this example, what it really says is E056>0948 in the |
| # InterIndic-Devanagari transliterator. |
| |
| if ($EXCEPTIONS_FILE) { |
| open(EXCEPTIONS_FILE) or die; |
| while (<EXCEPTIONS_FILE>) { |
| if (m|^\s*\"([^\"]*?)\"|) { |
| my $line = $_; |
| $_ = $1; |
| if (/^(.*)>(.*);$/) { |
| my ($rawFrom, $rawTo) = ($1, $2); |
| my @from = parseUnicodeEscape($rawFrom); |
| my @to = parseUnicodeEscape($rawTo); |
| my $from = hexArray(@from); |
| # Some entries look like this: |
| # |"\u0955>\u0955;" |
| # these do nothing; ignore them. |
| if (intArraysEqual(\@from, \@to)) { |
| #print STDERR "Ignoring NOOP remap of $from\n"; |
| } elsif (exists $EXCEPTIONS{$from}) { |
| print STDERR "ERROR in $EXCEPTIONS_FILE - Duplicate remap entries for $from\n"; |
| } elsif (scalar @from > 1) { |
| print STDERR "ERROR in $EXCEPTIONS_FILE - Ignoring multichar remap: ", hexArray(@from), "->", hexArray(@to), "\n"; |
| } else { |
| # Check this for validity. Full code on the left |
| # should NOT exist. Full code seq on the right should. |
| if (exists $FULLCODE_TO_NAME{$from[0]}) { |
| print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; left side defined: ", hexArray(@from), "->", hexArray(@to), "\n"; |
| } elsif (grep(! exists $FULLCODE_TO_NAME{$_}, @to)) { |
| print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; right side undefined: ", hexArray(@from), "->", hexArray(@to), "\n"; |
| } else { |
| $EXCEPTIONS{$from[0]} = \@to; |
| } |
| } |
| } else { die "ERROR in $EXCEPTIONS_FILE - Can't parse \"$_\" in line $line"; } |
| } |
| } |
| close(EXCEPTIONS_FILE); |
| print STDERR "$EXCEPTIONS_FILE: Loaded ", scalar keys %EXCEPTIONS, " remappings\n"; |
| } |
| |
| if ($DO_DECOMP_REMAP) { |
| # Read the NamesList.txt file. This contains decomposition data. |
| # Gather these into %DECOMP, which maps a name to n1.n2..., where n1 |
| # etc. are decomposed names. E.g. $DECOMP{'LETTER RRA'} -> 'LETTER |
| # RA.SIGN NUKTA'. There may be different mappings in different script |
| # blocks (LETTER RRA is mapped differently in Devanagari and Bengali), |
| # in which case the name goes into %DECOMP_MISMATCH, and is removed |
| # from %DECOMP. |
| $NAMES = "NamesList.txt"; |
| open(NAMES); |
| while (<NAMES>) { |
| # Skip to start of DEVANAGARI block |
| last if (/^\@\@\s+0900/); |
| } |
| while (<NAMES>) { |
| # Continue until start of SINHALA block |
| last if (/^\@\@\s+0D80/); |
| if (/^([0-9A-Z]{4})/i) { |
| $code = $1; |
| } elsif (/^\s+:\s*(.+)/) { |
| # We've found a mapping of the form: |
| # 0929 DEVANAGARI LETTER NNNA |
| # * for transcribing Dravidian alveolar n |
| # : 0928 093C |
| my $from = $FULLCODE_TO_NAME{hex($code)}; |
| my @to = map($FULLCODE_TO_NAME{hex($_)}, split(/\s+/, $1)); |
| if (exists $DECOMP{$from}) { |
| my $aref = $DECOMP{$from}; |
| if (join(".", @$aref) ne join(".", @to)) { |
| print STDERR "ERROR: Decomp mismatch for $from\n"; |
| print STDERR " : $from = ", join(".", @$aref), "\n"; |
| print STDERR " : $from = ", join(".", @to), "\n"; |
| $DECOMP_MISMATCH{$from} = 1; |
| } |
| } else { |
| $DECOMP{$from} = \@to; |
| } |
| } |
| } |
| close(NAMES); |
| # Remove mismatches |
| foreach (keys %DECOMP_MISMATCH) { |
| delete $DECOMP{$_}; |
| } |
| if ($DUMP) { |
| foreach (keys %DECOMP) { |
| print "$_ = ", join(" + ", @{$DECOMP{$_}}), "\n"; |
| } |
| } |
| } |
| |
| # Count the total number of scripts |
| |
| $SCRIPT_COUNT = scalar keys %SCRIPT_TO_BASE; |
| #print join("\n", sort keys %SCRIPT_TO_BASE), "\n"; |
| |
| # Dump out the %NAME_CODE_TO_SCRIPTS map. |
| |
| if ($DUMP) { |
| print "\nBY NAME:\n"; |
| foreach my $pass ((1, 2)) { |
| print "\nBY NAME - SINGLETONS:\n" if ($pass eq 2); |
| foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) { |
| if ($pass eq 1) { |
| next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count}); |
| } else { |
| next if (1 < $NAME_CODE_TO_SCRIPTS{$name}{count}); |
| } |
| print "$name:"; |
| my $href = $NAME_CODE_TO_SCRIPTS{$name}; |
| foreach my $code (sort {$a <=> $b} keys %$href) { |
| next if ($code eq 'count'); |
| my $aref = $href->{$code}; |
| print " ", hex2($code), " (", formatScriptList($aref), ")"; |
| } |
| print "\n"; |
| } |
| } |
| } |
| |
| # Create some transliterators, based on the scripts and the %NAME_CODE_TO_SCRIPTS |
| # map. Only use %NAME_CODE_TO_SCRIPTS entries with a count of 2 or more, that is, |
| # names that occur in two or more scripts. For those scripts where |
| # the names occur, map both up to the InterIndic range, and down to |
| # the target script. |
| |
| $INTERINDIC = 0xE000; |
| $INTERINDIC_EXTRA = 0xE080; |
| $INTERINDIC_EXTRA_NEXT = $INTERINDIC_EXTRA; |
| |
| # For each script, create a hash. The hash has a key for each |
| # code point, either within its block, or in the InterIndic block. |
| # the value of the key is the mapping. |
| |
| # The script hashes are named %DEVANAGARI, etc., and referenced |
| # with symbolic refs. |
| |
| @REMAP = ('s/\bSHORT\s+//i', |
| 's/\bCANDRA\s+//i', |
| 's/\bQA$/KA/i', |
| 's/\bKHHA$/KHA/i', |
| 's/\bGHHA$/GA/i', |
| 's/\bZA$/JA/i', |
| 's/\bFA$/PHA/i', |
| 's/\bVA$/BA/i', |
| 's/\bNNNA$/NA/i', |
| 's/\bRRA$/RA/i', |
| 's/\bLLLA$/LLA/i', |
| 's/\bLLLA$/LA/i', |
| 's/\bLLA$/LA/i', |
| 's/^A(.) LENGTH MARK$/VOWEL SIGN A$1/i', |
| 's/CANDRABINDU/BINDI/i', |
| 's/BINDI/CANDRABINDU/i', |
| ); |
| |
| # Do this so we see zero counts: |
| foreach my $remap (@REMAP) { $REMAP{$remap} = 0; } |
| |
| # This loop iterates over the names in the NAME_CODE_TO_SCRIPTS hash. |
| # These names are things like "LETTER NNNA". For each name, it then |
| # creates script mappings up to the InterIndic area, and back down |
| # to the script areas. If a name maps to more than one offset, |
| # then it uses the InterIndic extra range. Either way, it picks |
| # a single InterIndic point, either an offset point or something in |
| # the extra range, and maps up and down from that point. |
| foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) { |
| next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count}); |
| my $href = $NAME_CODE_TO_SCRIPTS{$name}; |
| # Count the number of different codes assigned to this name. |
| # Usually 1, but 2 for a handful of names. |
| my $codeCount = (keys %{$NAME_CODE_TO_SCRIPTS{$name}}) - 1; # less 1: {count} |
| # If $codeCount is 1, then map directly up to the $INTERINDIC |
| # base. If $codeCount is 2, then map into unused spots starting |
| # at $INTERINDIC_EXTRA. |
| my $interIndicCode; |
| if ($codeCount > 1) { |
| # Map into the InterIndic extra range |
| $interIndicCode = $INTERINDIC_EXTRA_NEXT++; |
| } |
| my %seen; |
| foreach my $code (sort {$a ne 'count' && $b ne 'count' && $a <=> $b} keys %$href) { |
| next if ($code eq 'count'); |
| my $aref = $href->{$code}; # Ref to array of scripts |
| if ($codeCount == 1) { |
| # Map directly |
| $interIndicCode = $INTERINDIC + $code; |
| } |
| # Keep track of the names of the extra InterIndic points |
| $INTERINDIC_NAME_TO_FULLCODE{$name} = $interIndicCode; |
| |
| foreach my $scr (@$aref) { |
| $seen{$scr} = 1; |
| my $fullCode = $SCRIPT_TO_BASE{$scr} + $code; |
| $ {$scr}{$fullCode} = hex4($interIndicCode) . "; // $name"; |
| $ {$scr}{$interIndicCode} = hex4($fullCode) . "; // $name"; |
| } |
| } |
| # Now handle InterIndic->Script unmapped points. For each name, |
| # some of the scripts will be left out -- will have no mappings |
| # to that name. For these scripts, we can either leave them |
| # unmapped (so the InterIndic->Local mapping is empty), or |
| # try to remap. |
| unmappedScript: |
| foreach my $scr (keys %SCRIPT_TO_BASE) { |
| next if ($seen{$scr}); |
| |
| if ($DO_HEURISTIC_REMAP) { |
| # Try to remap through the known equivalences in our |
| # remapping table |
| foreach my $remapRE (@REMAP) { |
| local $_ = $name; |
| if (eval($remapRE)) { |
| if (exists $SCRIPT_NAME_TO_CODE{$scr}{$_}) { |
| $ {$scr}{$interIndicCode} = |
| hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$_}) . |
| "; // REMAP: $name -> $_"; |
| ++$REMAP{$remapRE}; |
| next unmappedScript; |
| } |
| } |
| } |
| } |
| |
| # Try to remap through the file. This contains remappings of |
| # the form 0991->0993. That is, it contains local remappings |
| # that we can substitute and try again with. |
| #|GURMUKHI-InterIndic ------------------------------ |
| #|// 0A02>; // UNMAPPED INTERNAL: SIGN BINDI |
| #|InterIndic-GURMUKHI ------------------------------ |
| #|// E001>; // UNMAPPED EXTERNAL: SIGN CANDRABINDU |
| #|"\u0A01>\u0A02;" |
| # In this example, the remapping tells us that the non-existent |
| # character A01 should be considered equivalent to the real |
| # character A02. |
| # We implement this by adding two mappings; one from |
| # the InterIndic equivalent of A01, that is, E001, to A02, |
| # and one from A02, which otherwise has no mapping, to E001. |
| if ($EXCEPTIONS_FILE && $interIndicCode < $INTERINDIC_EXTRA) { |
| # Try to map this InterIndic character back to a the spot |
| # it would occupy in this script if it had a mapping. |
| my $code = $interIndicCode & 0x7F; |
| my $pseudoFullCode = $SCRIPT_TO_BASE{$scr} + $code; |
| if (exists $EXCEPTIONS{$pseudoFullCode}) { |
| my $fullCodeArray = $EXCEPTIONS{$pseudoFullCode}; |
| my $comment; |
| foreach my $c (@$fullCodeArray) { |
| $comment .= "." if ($comment); |
| $comment .= $FULLCODE_TO_NAME{$c}; |
| } |
| $comment = "; // REMAP ($EXCEPTIONS_FILE): " . |
| hex4($pseudoFullCode) . ">" . hexArray(@$fullCodeArray) . " = " . |
| $CODE_TO_NAME{$code} . ">" . $comment; |
| $ {$scr}{$interIndicCode} = hexArray(@$fullCodeArray) . $comment; |
| if (scalar @$fullCodeArray == 1) { |
| if (exists $ {$scr}{$fullCodeArray->[0]}) { |
| # There's already a proper mapping; no need to fill |
| # in reverse |
| } else { |
| $ {$scr}{$fullCodeArray->[0]} = hex4($interIndicCode) . $comment; |
| } |
| } |
| next unmappedScript; |
| } |
| } |
| |
| $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$interIndicCode} = 1; |
| local $_ = "; // UNMAPPED InterIndic-$scr: $name"; |
| if (exists $SCRIPT_CODE_TO_NAME{$scr}{$interIndicCode & 0x7F}) { |
| my $fullCode = $SCRIPT_TO_BASE{$scr} + ($interIndicCode & 0x7F); |
| $_ .= " (" . hex4($fullCode) . " = " . $FULLCODE_TO_NAME{$fullCode} . ")"; |
| } |
| $ {$scr}{$interIndicCode} = $_; |
| } |
| } |
| |
| # Add in unmapped entries for each script |
| foreach my $scr (keys %SCRIPT_TO_BASE) { |
| my $base = $SCRIPT_TO_BASE{$scr}; |
| unmappedInt: |
| foreach my $code (keys %{$SCRIPT_CODE_TO_NAME{$scr}}) { |
| my $fullCode = $code + $base; |
| next if (exists $ {$scr}{$fullCode}); |
| my $name = $SCRIPT_CODE_TO_NAME{$scr}{$code}; |
| |
| if ($DO_HEURISTIC_REMAP) { |
| foreach my $remapRE (@REMAP) { |
| local $_ = $name; |
| if (eval($remapRE)) { |
| if (exists $INTERINDIC_NAME_TO_FULLCODE{$_}) { |
| $ {$scr}{$fullCode} = |
| hex4($INTERINDIC_NAME_TO_FULLCODE{$_}) . |
| "; // REMAP: $name -> $_"; |
| ++$REMAP{$remapRE}; |
| next unmappedInt; |
| } |
| } |
| } |
| } |
| |
| # Now try the decomp table |
| if ($DO_DECOMP_REMAP && exists $DECOMP{$name}) { |
| my $x; |
| my $cmt = "; // DECOMP: $name -> "; |
| foreach my $n (@{$DECOMP{$name}}) { |
| if (exists $SCRIPT_NAME_TO_CODE{$scr}{$n}) { |
| $x .= hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$n}); |
| $cmt .= $n . " + "; |
| } else { |
| $cmt = 0; |
| last; |
| } |
| } |
| if ($cmt) { |
| $ {$scr}{$fullCode} = $x . $cmt; |
| next unmappedInt; |
| } |
| } |
| |
| $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode} = 1; |
| $ {$scr}{$fullCode} = "; // UNMAPPED $scr-InterIndic: $name"; |
| } |
| } |
| |
| # GUR |
| # E00B>; // UNMAPPED EXTERNAL: LETTER VOCALIC R "\u0A0B>\u0A30\u0A3F;" |
| # E00C>; // UNMAPPED EXTERNAL: LETTER VOCALIC L "\u0A0C>\u0A07;" |
| # E00D>; // UNMAPPED EXTERNAL: LETTER CANDRA E "\u0A0D>\u0A10;" |
| # E011>; // UNMAPPED EXTERNAL: LETTER CANDRA O "\u0A11>\u0A14;" |
| # E037>; // UNMAPPED EXTERNAL: LETTER SSA "\u0A37>\u0A36;" |
| # E045>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA E "\u0A45>\u0A48;" |
| # E049>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA O "\u0A49>\u0A4C;" |
| # Fix QA too |
| |
| # Dump out script maps |
| foreach my $scr (sort keys %SCRIPT_TO_BASE) { |
| ## next unless ($scr eq 'TELUGU'); # Debugging |
| my @rules; |
| my $flag = 1; |
| foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) { |
| if ($flag && $fullCode >= $INTERINDIC) { |
| # We have the complete <scr>-InterIndic rules; dump |
| # them out. |
| generateRBT($scr, "InterIndic", \@rules, $OUTDIR); |
| @rules = (); |
| $flag = 0; |
| } |
| if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) { |
| push @rules, "// " . hex4($fullCode) . ">" . $ {$scr}{$fullCode}; |
| } else { |
| push @rules, hex4($fullCode) . ">" . $ {$scr}{$fullCode}; |
| } |
| } |
| # Now generate the InterIndic-<scr> rules. |
| generateRBT("InterIndic", $scr, \@rules, $OUTDIR); |
| |
| # print "$scr-InterIndic ------------------------------\n"; |
| # my $flag = 1; |
| # foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) { |
| # if ($flag && $fullCode >= $INTERINDIC) { |
| # print "InterIndic-$scr ------------------------------\n"; |
| # $flag = 0; |
| # } |
| # if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) { |
| # print "// ", hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n"; |
| # } else { |
| # print hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n"; |
| # } |
| # } |
| } |
| |
| # List successful remappings |
| if ($DO_HEURISTIC_REMAP) { |
| foreach my $remap (sort keys %REMAP) { |
| print STDERR "REMAP ", $REMAP{$remap}, " x $remap\n"; |
| } |
| } |
| |
| #---------------------------------------------------------------------- |
| # SUBROUTINES |
| |
| # Return a listing of an array of scripts |
| # Param: array ref |
| sub formatScriptList { |
| my $aref = shift; |
| if ($SCRIPT_COUNT == @$aref) { |
| return "all"; |
| } elsif (($SCRIPT_COUNT - 3) <= @$aref) { |
| my $s = "all but"; |
| my %temp; |
| foreach (@$aref) { $temp{$_} = 1; } |
| foreach (sort keys %SCRIPT_TO_BASE) { |
| $s .= " $_" unless exists $temp{$_}; |
| } |
| return $s; |
| } else { |
| return join(" ", @$aref); |
| } |
| } |
| |
| # Format as %02X hex |
| sub hex2 { |
| sprintf("%02X", $_[0]); |
| } |
| |
| # Format as %04X hex |
| sub hex4 { |
| sprintf("\\u%04X", $_[0]); |
| } |
| |
| # Format an array as %04X hex, delimited by "."s |
| sub hexArray { |
| join("", map { hex4($_); } @_); |
| } |
| |
| # Parse a string of the form "\u0D01" to an array of integers. |
| # Must ONLY contain escapes. |
| # Return the array. |
| sub parseUnicodeEscape { |
| local $_ = shift; |
| my $orig = $_; |
| my @result; |
| while (length($_)) { |
| if (/^\\u([0-9a-f]{4})(.*)/i) { |
| push @result, hex($1); |
| $_ = $2; |
| } else { |
| die "Can't parse Unicode escape $orig\n"; |
| } |
| } |
| if (0 == @result) { |
| die "Can't parse Unicode escape $orig\n"; |
| } |
| @result; |
| } |
| |
| # Return 1 if the two arrays of ints are equal. |
| # Param: ref to array of ints |
| # Param: ref to array of ints |
| sub intArraysEqual { |
| my $a = shift; |
| my $b = shift; |
| if (scalar @$a == scalar @$b) { |
| for (my $i=0; $i<@$a; ++$i) { |
| if ($a->[$i] != $b->[$i]) { |
| return 0; |
| } |
| } |
| return 1; |
| } |
| return 0; |
| } |
| |
| # Given a rule, possibly with trailing // comment, |
| # quote the rule part and add a trailing "+" after |
| # it. |
| sub quoteRule { |
| my $cmt; |
| $cmt = $1 if (s|(\s*//.*)||); # isolate trailing // comment |
| s/^(.*;)/\"$1\"+/; |
| s/$/$cmt/; |
| $_; |
| } |
| |
| # Given the name of the source script, name of the target script, |
| # and array of rule strings, return a string containing the source |
| # for a RuleBasedTransliterator file. |
| # Param: source script name |
| # Param: target script name |
| # Param: ref to array of rules. These rules are unquoted, without |
| # concatenators between them, but do have trailing ';' separators. |
| # Param: name of output directory |
| sub generateRBT { |
| # $TOOL - name of generating tool |
| # $DATE - date of generation |
| # $SCRIPTFROM - name of source script |
| # $SCRIPTTO - name of target script |
| # $RULES - rules |
| my ($source, $target, $rules, $outdir) = @_; |
| my $text; |
| $outdir =~ s|[/\\]$||; # Delete trailing / or \ |
| my $OUT = "$outdir/TransliterationRule_${source}_$target.java"; |
| open(RBT_TEMPLATE) or die; |
| open(OUT, ">$OUT") or die; |
| while (<RBT_TEMPLATE>) { |
| while (/\$([A-Za-z0-9]+)/) { |
| my $tag = $1; |
| my $sub; |
| if ($tag eq 'TOOL') { |
| $sub = $RBT_GEN_TOOL; |
| } elsif ($tag eq 'DATE') { |
| $sub = localtime; |
| } elsif ($tag eq 'SCRIPTFROM') { |
| $sub = $source; |
| } elsif ($tag eq 'SCRIPTTO') { |
| $sub = $target; |
| } elsif ($tag eq 'RULES') { |
| # Get any whitespace-only indent off the front of this tag |
| my $indent; |
| $indent = $1 if (/^(\s+)\$$tag/); |
| |
| # The rules in the array are not quoted. We need to quote |
| # them and add '+' operators between them. We do NOT need |
| # to add ';' separators. We DO need to separate trailing |
| # // comments and handle them. |
| $sub = join("\n$indent", map("eRule, @$rules)) . |
| "\n$indent\"\""; |
| } else { |
| print STDERR "ERROR in $RBT_TEMPLATE: Unknown tag $tag\n"; |
| $sub = "[ERROR:Unknown tag \$$tag]"; |
| } |
| s/\$$tag/$sub/; |
| } |
| print OUT; |
| } |
| close(OUT); |
| close(RBT_TEMPLATE); |
| print STDERR "Written: $OUT\n"; |
| } |
| |
| __END__ |