| @rem = '--*-Perl-*-- |
| @echo off |
| if "%OS%" == "Windows_NT" goto WinNT |
| perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9 |
| goto endofperl |
| :WinNT |
| perl -x -S "%0" %* |
| if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl |
| if %errorlevel% == 9009 echo You do not have Perl in your PATH. |
| goto endofperl |
| @rem '; |
| #!perl |
| #line 14 |
| # ************************************************************************ |
| # Copyright (C) 2000-2004, International Business Machines Corporation and |
| # others. All Rights Reserved. |
| # ************************************************************************ |
| |
| # This perl script creates ICU transliterator data files, that live |
| # in icu/data, from ICU4J UTF8 transliterator data files, in |
| # icu4j/src/com/ibm/icu/impl/data/. |
| # |
| # The transformation that is done is very minimal. The script assumes |
| # that the input files use only # comments |
| # and that they follow a rigid format. |
| # |
| # The output files are named according to ICU conventions (see NAME_MAP |
| # below) and created in the current directory. They should be manually |
| # checked and then copied into the icu/data/translit directory. |
| # An ICU build must |
| # then be initiated, and the standard suite of ICU transliterator tests |
| # should be run after that. |
| # |
| # Alan Liu 5/19/00 2/27/01 |
| |
| use Getopt::Long; |
| use strict; |
| |
| use vars qw(%USED_FILES); |
| |
# Command-line settings and their defaults.
my $DIR = "../../../impl/data";   # --dir: directory holding the Transliterator_*.txt files
my $ID = '';                      # --id:  optional single ID to transform

# '<>' routes any bare (non-option) argument to usage(), which exits.
GetOptions('dir=s' => \$DIR,
           'id=s' => \$ID,
           '<>' => \&usage) || die;

usage() if (@ARGV);

# IDs are written with '-' but the rule file names use '_'.  Operate on the
# $ID filled in by GetOptions: writing "my $ID =~ ..." here would declare a
# second, empty $ID that hides the parsed value for the rest of the file.
$ID =~ s/-/_/g;

if (! -d $DIR) {
    print STDERR "$DIR is not a directory\n";
    usage();
}
| |
# Print the command-line synopsis on STDOUT and terminate the script with
# exit status 1.  Takes no parameters and never returns.
sub usage {
    # Reduce $0 to its leaf name: drop everything up to and including the
    # last slash or backslash.
    (my $me = $0) =~ s|.+[/\\]||;

    print "Usage: $me [-dir <dir>] [-id <id>]\n",
          " --dir <dir> Specify the directory containing the\n",
          " Transliterator_*.txt files\n",
          " --id <id> Specify a single ID to transform, e.g.\n",
          " Fullwidth-Halfwidth\n";

    exit(1);
}
| |
# Marker stored in %NAME_MAP for rule files that exist for Java but are not
# converted for C.
my $JAVA_ONLY = '-';

# Directory, relative to the current directory, that receives all output.
my $OUTDIR = "icu4c";

# Create the output directory if it is not there yet.  An existing directory
# is fine; any other failure is reported here with its reason instead of
# surfacing later as a message-less die when the first output file is opened.
if (! -d $OUTDIR) {
    mkdir($OUTDIR, 0777) or die "Can't create output directory $OUTDIR: $!\n";
}

# Mapping from Java file names to ICU file names
# Constraints on ICU4C file name:
#   prefix, e.g. "icudt20b_"                       =  9 characters
#   name including its distinguisher, e.g. "t_"    = 18 characters
#   "." + extension                                =  4 characters
#                                            total = 31 characters
# That is, must have length($NAME_MAP{x}) <= 16

my $MAX_ICU4C_FILENAME_LEN = 18;
| |
| # -- HISTORY -- If not marked, then pre 2.2. |
| # All InterIndic are pre 2.2. |
| # Any_Accents |
| # Any_Publishing |
| # Arabic_Latin * 2.2 |
| # Cyrillic_Latin |
| # Fullwidth_Halfwidth |
| # Greek_Latin |
| # Greek_Latin_UNGEGN * 2.2 (moved from el.txt) |
| # Han_Latin * 2.2 |
| # Han_Latin_Definition * 2.2 |
| # Han_Latin_EDICT * 2.2 J only |
| # Hebrew_Latin * 2.2 |
| # Hiragana_Katakana |
| # Hiragana_Latin |
| # Latin_Jamo |
| # Latin_Katakana |
| # ThaiLogical_Latin * 2.2 J only |
| # Thai_ThaiLogical * 2.2 J only |
| # Thai_ThaiSemi * 2.2 J only |
| |
# Java rule ID => ICU4C base name (without the "t_" distinguisher).
#
#   ''          the ICU name is the same as the ID
#   $JAVA_ONLY  the rule exists and is used in Java, but is not used in C
#
# Names are filtered based on what is in use in the index file.
# Use official script abbreviations where possible, and keep every
# value at 16 characters or fewer.
my %NAME_MAP = (
    # Rules whose ICU name equals the ID
    'Any_Accents'           => '',
    'Any_Publishing'        => '',

    # Pre-2.2 script pairs
    'Cyrillic_Latin'        => 'Cyrl_Latn',
    'Fullwidth_Halfwidth'   => 'FWidth_HWidth',
    'Greek_Latin'           => 'Grek_Latn',
    'Hiragana_Katakana'     => 'Hira_Kana',
    'Hiragana_Latin'        => 'Hira_Latn',
    'Latin_Jamo'            => 'Latn_Jamo',
    'Latin_Katakana'        => 'Latn_Kana',

    # Added in 2.2
    'Arabic_Latin'          => 'Arab_Latn',
    'Greek_Latin_UNGEGN'    => 'Grek_Latn_UNGEGN',
    'Han_Latin'             => 'Hani_Latn',
    'Han_Latin_Definition'  => 'Hani_Latn_Def',
    'Han_Latin_EDICT'       => 'Hani_Latn_EDICT',
    'Hebrew_Latin'          => 'Hebr_Latn',
    'ThaiLogical_Latin'     => $JAVA_ONLY,    # "ThaiLog_Latn",
    'Thai_ThaiLogical'      => $JAVA_ONLY,    # "Thai_ThaiLog",
    'Thai_ThaiSemi'         => $JAVA_ONLY,    # "Thai_ThaiSemi",

    # InterIndic -> script
    'InterIndic_Bengali'    => 'InterIndic_Beng',
    'InterIndic_Devanagari' => 'InterIndic_Deva',
    'InterIndic_Gujarati'   => 'InterIndic_Gujr',
    'InterIndic_Gurmukhi'   => 'InterIndic_Guru',
    'InterIndic_Kannada'    => 'InterIndic_Knda',
    'InterIndic_Latin'      => 'InterIndic_Latn',
    'InterIndic_Malayalam'  => 'InterIndic_Mlym',
    'InterIndic_Oriya'      => 'InterIndic_Orya',
    'InterIndic_Tamil'      => 'InterIndic_Taml',
    'InterIndic_Telugu'     => 'InterIndic_Telu',

    # script -> InterIndic
    'Bengali_InterIndic'    => 'Beng_InterIndic',
    'Devanagari_InterIndic' => 'Deva_InterIndic',
    'Gujarati_InterIndic'   => 'Gujr_InterIndic',
    'Gurmukhi_InterIndic'   => 'Guru_InterIndic',
    'Kannada_InterIndic'    => 'Knda_InterIndic',
    'Latin_InterIndic'      => 'Latn_InterIndic',
    'Malayalam_InterIndic'  => 'Mlym_InterIndic',
    'Oriya_InterIndic'      => 'Orya_InterIndic',
    'Tamil_InterIndic'      => 'Taml_InterIndic',
    'Telugu_InterIndic'     => 'Telu_InterIndic',

    # Java-only rule sets
    'Han_Pinyin'            => $JAVA_ONLY,
    'Kanji_English'         => $JAVA_ONLY,
    'Kanji_OnRomaji'        => $JAVA_ONLY,

    # Pinyin helpers
    'Latin_NumericPinyin'   => 'Latn_NPinyn',
    'Tone_Digit'            => 'Tone_Digit',
    'Han_Spacedhan'         => 'Hani_SpHan',
);
| |
# Current four-digit year, used in the generated copyright notice.
# localtime() in list context returns the year as element 5, counted
# from 1900; a list slice picks it out without dummy variables.
my $THIS_YEAR = (localtime())[5] + 1900;

# Header blocks of text written at start of ICU output files.
# header() prints $HEADER1, then the tool/source/date lines, then $HEADER2.
my $HEADER1 = <<END;
//--------------------------------------------------------------------
// Copyright (c) 1999-$THIS_YEAR, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
END
my $HEADER2 = <<END;
//--------------------------------------------------------------------
END

# Name this script was invoked as; recorded in every generated header.
my $TOOL = $0;
| |
# Convert the index first; this tells us which rule files are in use.
convertIndex();

# print "In use:\n", join("\n", sort keys(%USED_FILES)), "\n";

# Iterate over all Java RBT rule files.  A named loop variable is used
# rather than $_ because file() reads its input through $_.
foreach my $path (glob("$DIR/Transliterator_*.txt")) {
    next if ($path =~ /~$/);            # Ignore emacs backups
    next if ($path =~ /_index\.txt$/);  # The index file was processed above

    # Leaf name: the path with everything up to the last slash or
    # backslash removed.
    (my $leaf = $path) =~ s|.+[/\\]||;

    # Select either the command-line arg, if there was one, or
    # any files mentioned in the index.  \Q...\E makes the ID match as
    # literal text, so regex metacharacters in it cannot change the match.
    if (($ID && $leaf =~ /\Q$ID\E/) || exists $USED_FILES{$leaf}) {
        my ($out, $id) = convertFileName($path);
        # An empty name means convertFileName already reported an error.
        next unless $out;
        if ($out eq $JAVA_ONLY) {
            print STDERR "*** $id skipped: Java only ***\n";
            next;
        }
        file($id, $path, $out);
    } elsif (!$ID) {
        print "*** $leaf skipped: not in use ***\n";
    }
}
| |
######################################################################
# Convert a Java file name to the corresponding C (ICU4C) base name.
# Param:  Java file name ending in Transliterator_<id>.utf8.txt or
#         Transliterator_<id>.txt; dies if it matches neither form.
# Return: the list (name, id), where id is the <id> part of the file
#         name and name is either "t_" followed by the %NAME_MAP entry
#         (or by the id itself when the entry is empty), or $JAVA_ONLY
#         if the given file isn't intended to be incorporated into C.
#         Returns the empty string, after printing an error on STDERR,
#         if the id has no mapping or the resulting name is too long.
sub convertFileName {
    local $_ = shift;

    # Try the .utf8.txt form first, then plain .txt.
    my ($id) = m|Transliterator_(.+)\.utf8\.txt$|;
    ($id) = m|Transliterator_(.+)\.txt$| unless defined $id;
    die "Can't parse Java file name $_" unless defined $id;

    unless (exists $NAME_MAP{$id}) {
        print STDERR "ERROR: $id not in map; please update $0\n";
        return '';
    }

    # An empty map entry means "use the ID itself".
    my $mapped = $NAME_MAP{$id};
    my $out = ($mapped eq '') ? $id : $mapped;

    # Real output names get the "t_" distinguisher; the Java-only
    # marker is passed through untouched.
    $out = 't_' . $out unless $out eq $JAVA_ONLY;

    if (length($out) > $MAX_ICU4C_FILENAME_LEN) {
        print STDERR "ERROR: ICU4C file name \"$out\" too long; please update $0\n";
        return '';
    }

    return ($out, $id);
}
| |
######################################################################
# Convert the index file from Java to C format.
# Reads $DIR/Transliterator_index.txt and writes
# $OUTDIR/translit_index.txt.  As a side effect, records every rule file
# named on a 'file' or 'internal' line in %USED_FILES.
# Dies, with the path and system error, if either file cannot be opened
# or the output cannot be completely written, and dies on any content
# line that is not a 'file', 'internal' or 'alias' entry.
sub convertIndex {
    my $JAVA_INDEX = "Transliterator_index.txt";
    my $C_INDEX = "translit_index.txt";
    my $javaPath = "$DIR/$JAVA_INDEX";
    my $cPath = "$OUTDIR/$C_INDEX";

    # Three-argument open: the mode can never be taken from the path.
    open(my $javaFh, '<', $javaPath) or die "Can't open $javaPath: $!";
    open(my $cFh, '>', $cPath) or die "Can't create $cPath: $!";

    header($cFh, $JAVA_INDEX);

    print {$cFh} <<END;
//--------------------------------------------------------------------
// N.B.: This file has been generated mechanically from the
// corresponding ICU4J file, which is the master file that receives
// primary updates. The colon-delimited fields have been split into
// separate strings. For 'file' and 'internal' lines, the encoding
// field has been deleted, since the encoding is processed at build
// time in ICU4C. Certain large rule sets not intended for general
// use have been commented out with the notation "Java only".
//--------------------------------------------------------------------

translit_index {
RuleBasedTransliteratorIDs {
END

    # The read loop assigns to $_; keep that from leaking to the caller.
    local $_;
    while (<$javaFh>) {
        # ignore CVS keyword substitutions
        next if /\$(Source|Revision|Date)/;

        # we have printed out the copyright info ... ignore one in Java version
        next if /Copyright/;
        next if /Corporation/;

        # Comments; change # to //
        if (s|^(\s*)\#|$1//|) {
            print {$cFh} $_;
            next;
        }
        # Blank lines
        if (!/\S/) {
            print {$cFh} $_;
            next;
        }

        # Content lines
        chomp;
        my $prefix = '';
        # replace \p with \\p
        s/\\p/\\\\p/g;
        my @a = split(':', $_);
        if ($a[1] eq 'file' || $a[1] eq 'internal') {
            # Record file names in use (under the Java name) ...
            $USED_FILES{$a[2]} = 1;
            # ... then convert the file name; only the first element of
            # the (name, id) list returned by convertFileName is needed.
            ($a[2]) = convertFileName($a[2]);
            if ($a[2] eq $JAVA_ONLY) {
                $prefix = '// Java only: ';
            }

            # Delete the encoding field
            splice(@a, 3, 1);
        } elsif ($a[1] eq 'alias') {
            # Pad out with extra blank fields to make the
            # 2-d array square
            push @a, "";
        } else {
            die "Can't parse $_";
        }
        print {$cFh}
            $prefix, "{ ",
            join(", ", map("\"$_\"", @a)),
            " },\n";
    }

    print {$cFh} <<END;
}
}
END

    # Buffered write errors only show up at close time.
    close($cFh) or die "Can't finish writing $cPath: $!";
    close($javaFh);
    print STDERR "$JAVA_INDEX -> $C_INDEX\n";
}
| |
######################################################################
# Output the standard header of a generated file: $HEADER1, the name of
# this tool, the source the file was generated from, the current local
# time, $HEADER2 and a blank line.
# Param: Filehandle to write to
# Param: Source name to report on the "// Source:" line
sub header {
    my ($fh, $source) = @_;

    print {$fh} $HEADER1,
                "// Tool: $TOOL\n// Source: $source\n",
                "// Date: ", scalar(localtime), "\n",
                $HEADER2,
                "\n";
}
| |
######################################################################
# Process one file: convert a Java rule file into an ICU resource file
# $OUTDIR/<name>.txt that starts with a UTF-8 byte order mark and wraps
# the rules in "<name> { Rule { ... } }".  Comments become // comments
# and rule text is enclosed in double quotes.  Progress, including the
# input and output file sizes, is printed on STDOUT.
# Param: ID as extracted from the Java file name, e.g. Fullwidth_Halfwidth
# Param: Java input file name, e.g.
#   f:/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt
# Param: ICU output base name (no extension), e.g. t_FWidth_HWidth
# Dies, with the path and system error, if a file cannot be opened or the
# output cannot be completely written.
sub file {
    my $id = shift;
    my $IN = shift;
    my $out = shift;

    my $OUT = "$out.txt";
    my $outPath = "$OUTDIR/$OUT";

    # Show input size. Show output size later -- useful for quick sanity check.
    print "$id (", -s $IN, ") -> $OUT (";

    # Open file, write UTF8 marker, close it, and reopen in text mode
    open(my $bomFh, '>', $outPath) or die "Can't create $outPath: $!";
    binmode $bomFh; # Must do this so we can write our UTF8 marker
    print {$bomFh} pack("C3", 0xEF, 0xBB, 0xBF); # Write UTF8 marker
    close($bomFh) or die "Can't write $outPath: $!";

    open(my $outFh, '>>', $outPath) or die "Can't append to $outPath: $!";
    print {$outFh} " // -*- Coding: utf-8; -*-\n";

    header($outFh, $IN);
    print {$outFh} "// $id\n";
    print {$outFh} "\n";
    print {$outFh} "$out {\n";
    print {$outFh} " Rule {\n";

    # Three-argument open: the mode can never be taken from the file name.
    open(my $inFh, '<', $IN) or die "Can't open $IN: $!";
    binmode $inFh; # IN is a UTF8 file

    my $first = 1;
    my $BOM = pack("C3", 239, 187, 191); # a UTF8 byte order mark

    # The read loop and the escape helpers work on $_; keep the caller's
    # $_ intact.
    local $_;

    # Process each line by changing # comments to // comments
    # and taking other text and enclosing it in double quotes
    while (<$inFh>) {
        # ignore CVS keyword substitutions
        next if /\$(Source|Revision|Date)/;

        # we have printed out the copyright info ... ignore one in Java version
        next if /Copyright/;
        next if /Corporation/;

        # Look for and delete BOM
        if ($first) {
            s/^$BOM//;
            $first = 0;
        }

        # Clean the eol junk up
        s/[\x0D\x0A]+$//;

        # If there is a trailing backslash, then delete it -- we don't
        # need line continuation in C, since adjacent strings are
        # concatenated. Count trailing backslashes; if they are odd,
        # one is trailing.
        if (m|(\\+)$|) {
            if ((length($1) % 2) == 1) {
                s|\\$||;
            }
        }

        # Transform escaped characters into <<...>> placeholders so the
        # quote and comment handling below cannot trip over them
        hideEscapes();

        if (/^(\s*)(\#.*)$/) {
            # Comment-only line
            my ($white, $cmt) = ($1, $2);
            $cmt =~ s|\#|//|;
            $_ = $white . $cmt;

        } elsif (!/\S/) {
            # Blank line -- leave as-is

        } else {
            # Remove single-quoted matter, so that a # inside quotes is
            # not mistaken for the start of a comment
            my @quotes;
            my $nquotes = 0;
            while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
                push @quotes, $2;
                ++$nquotes;
            }

            # Extract comment
            my $cmt = '';
            if (s|\#(.*)||) {
                $cmt = '//' . $1;
            }

            # Add quotes
            s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;

            # Restore single-quoted matter
            for (my $i=0; $i<$nquotes; ++$i) {
                s|<<x$i>>|$quotes[$i]|;
            }

            # Restore comment
            $_ .= $cmt;
        }

        # Restore escaped characters
        restoreEscapes();

        print {$outFh} $_, "\n";
    }

    # Finish up
    close($inFh);
    print {$outFh} " }\n";
    print {$outFh} "}\n";
    # Buffered write errors only show up at close time.
    close($outFh) or die "Can't finish writing $outPath: $!";

    # Write output file size for sanity check
    my $outSize = -s $outPath;
    print $outSize, ")\n";
}
| |
| ###################################################################### |
# Replace every backslash escape in $_ with a <<...>> placeholder.
# Works on $_ in place; takes no parameters.
sub hideEscapes {
    # Ordered rules: [ pattern, placeholder builder ].  The builder gets
    # the first capture of the match (undef if the pattern has none).
    my @rules = (
        # DO THIS FIRST: a doubled backslash, so that its second half is
        # never read as the start of another escape
        [ qr|\\\\|,                sub { '<<bs>>' } ],
        # Unicode escapes
        [ qr|\\u([a-zA-Z0-9]{4})|, sub { "<<u$_[0]>>" } ],
        # Backslash double quote
        [ qr|\\\"|,                sub { '<<dq>>' } ],
        # Backslash single quote
        [ qr|\\\'|,                sub { '<<sq>>' } ],
        # Backslash pound
        [ qr|\\\#|,                sub { '<<lb>>' } ],
        # Any other backslash escape
        [ qr|\\(.)|,               sub { "<<q$_[0]>>" } ],
    );

    for my $rule (@rules) {
        my ($pattern, $placeholder) = @$rule;
        s/$pattern/$placeholder->($1)/ge;
    }
}
| |
| ###################################################################### |
# Turn the <<...>> placeholders in $_ back into backslash escapes.
# Works on $_ in place; takes no parameters.  Unicode escapes other than
# U+0000 come back as a single backslash plus uXXXX, a doubled backslash
# comes back as two backslashes, and every other escape comes back with
# three backslashes in front of the escaped character.
sub restoreEscapes {
    my $bs = "\\";    # one literal backslash

    # Placeholders with a fixed replacement, in the order they are applied.
    my @fixed = (
        [ '<<bs>>', $bs x 2 ],
        [ '<<dq>>', ($bs x 3) . '"' ],
        [ '<<sq>>', ($bs x 3) . "'" ],
        [ '<<lb>>', ($bs x 3) . '#' ],
    );
    for my $pair (@fixed) {
        my ($token, $text) = @$pair;
        s/$token/$text/g;
    }

    # Generic single-character escapes
    s/<<q(.)>>/$bs$bs$bs$1/g;

    # Double escape U+0000; must run before the general Unicode rule
    s/<<u0000>>/${bs}${bs}u0000/g;
    s/<<u(....)>>/${bs}u$1/g;
}
| |
| __END__ |
| :endofperl |