blob: 1f8aafbee7495c0b7b5ce07e203a7ceb24263e4f [file] [log] [blame]
@rem = '--*-Perl-*--
@echo off
if "%OS%" == "Windows_NT" goto WinNT
perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
goto endofperl
perl -x -S "%0" %*
if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
if %errorlevel% == 9009 echo You do not have Perl in your PATH.
goto endofperl
@rem ';
#line 14
# This perl script creates ICU transliterator data files, that live
# in icu/data, from ICU4J UTF8 transliterator data files, in
# icu4j/src/com/ibm/text/resources.
# The transformation that is done is very minimal. The script assumes
# that the input files use only # comments
# and that they follow a rigid format.
# The output files are named according to ICU conventions (see NAME_MAP
# below) and created in the $OUTDIR subdirectory ("icu4c") of the current
# directory. They should be manually checked and then copied into the
# icu/data directory. An ICU build must then be initiated, and the standard
# suite of ICU transliterator tests should be run after that.
# Alan Liu 5/19/00 2/27/01
# Command-line handling.
# NOTE(review): the rest of the `if` body below and its closing brace are not
# visible in this copy -- presumably it reports usage and stops; confirm
# against the master copy.
# First argument: directory holding the Java rule files (with a default).
$DIR = shift || "../../text/resources";
if (! -d $DIR) {
print STDERR "$DIR is not a directory\n";
# Optional second argument: a single transliterator ID to convert.
$ID = shift;
# The usage text writes IDs with '-' while the name-map keys use '_'.
# Without /g only the first '-' is replaced.
$ID =~ s/-/_/;
# Print a usage summary to STDOUT, referring to the script by its base name.
# Takes no parameters.
# NOTE(review): the end of this sub (its closing brace and anything that
# follows the last print) is not visible in this copy; confirm against the
# master copy.
sub usage {
my $me = $0;
# Strip everything up to the last '/' or '\' so only the script name remains
$me =~ s|.+[/\\]||;
print "Usage: $me <dir> [<id>]\n";
print " where <dir> contains the Transliterator_*.utf8.txt\n";
print " files.\n";
print "e.g., $me F:/icu4j/src/com/ibm/text/resources\n";
print " optional <id> specifies single ID to transform, e.g.\n";
print " Fullwidth-Halfwidth\n";
# Marker value used in the name map for rule files that are not converted
# for ICU4C (see the last entries of the map).
$JAVA_ONLY = '-';
# Directory, relative to the current directory, that receives all output files
$OUTDIR = "icu4c";
# Mapping from Java file names to ICU file names
# An ICU name of "" means the ICU name == the ID
# NOTE(review): the entries below are the body of the name map that
# convertFileName reads as %NAME_MAP; the line opening the hash and the line
# closing it are not visible in this copy.
"Any_Accents" => "",
"Any_Publishing" => "",
"Bengali_InterIndic" => "",
"Cyrillic_Latin" => "",
"Devanagari_InterIndic" => "",
"Fullwidth_Halfwidth" => "",
"Greek_Latin" => "",
"Gujarati_InterIndic" => "",
"Gurmukhi_InterIndic" => "",
"Hiragana_Katakana" => "",
"Hiragana_Latin" => "",
"InterIndic_Bengali" => "",
"InterIndic_Devanagari" => "",
"InterIndic_Gujarati" => "",
"InterIndic_Gurmukhi" => "",
"InterIndic_Kannada" => "",
"InterIndic_Latin" => "",
"InterIndic_Malayalam" => "",
"InterIndic_Oriya" => "",
"InterIndic_Tamil" => "",
"InterIndic_Telugu" => "",
"Kannada_InterIndic" => "",
"Latin_InterIndic" => "",
"Latin_Jamo" => "",
"Latin_Katakana" => "",
"Malayalam_InterIndic" => "",
"Oriya_InterIndic" => "",
"Tamil_InterIndic" => "",
"Telugu_InterIndic" => "",
# Java-only entries: the main loop reports these as "Java only", and the
# index conversion prefixes their lines with "// Java only: "
"Han_Pinyin" => $JAVA_ONLY,
"Kanji_English" => $JAVA_ONLY,
"Kanji_OnRomaji" => $JAVA_ONLY,
# "Fullwidth_Halfwidth" => "fullhalf",
# "Hiragana_Katakana" => "kana",
# "KeyboardEscape_Latin1" => "kbdescl1",
# "Latin_Arabic" => "larabic",
# "Latin_Cyrillic" => "lcyril",
# "Latin_Devanagari" => "ldevan",
# "Latin_Greek" => "lgreek",
# "Latin_Hebrew" => "lhebrew",
# "Latin_Jamo" => "ljamo",
# "Latin_Kana" => "lkana",
# "StraightQuotes_CurlyQuotes" => "quotes",
# "UnicodeName_UnicodeChar" => "ucname",
# # An ICU name of "" means the ICU name == the ID
# "Bengali_InterIndic" => "",
# "Devanagari_InterIndic" => "",
# "Gujarati_InterIndic" => "",
# "Gurmukhi_InterIndic" => "",
# "Kannada_InterIndic" => "",
# "Malayalam_InterIndic" => "",
# "Oriya_InterIndic" => "",
# "Tamil_InterIndic" => "",
# "Telugu_InterIndic" => "",
# "InterIndic_Bengali" => "",
# "InterIndic_Devanagari" => "",
# "InterIndic_Gujarati" => "",
# "InterIndic_Gurmukhi" => "",
# "InterIndic_Kannada" => "",
# "InterIndic_Malayalam" => "",
# "InterIndic_Oriya" => "",
# "InterIndic_Tamil" => "",
# "InterIndic_Telugu" => "",
# # These files are large, so ICU doesn't want them
# "Han_Pinyin" => $JAVA_ONLY,
# "Kanji_English" => $JAVA_ONLY,
# "Kanji_OnRomaji" => $JAVA_ONLY,
# );
# Header blocks of text written at start of ICU output files
// Copyright (c) 1999-2001, International Business Machines
// Corporation and others. All Rights Reserved.
# Script path as invoked; header() writes it on the "// Tool:" line
$TOOL = $0;
# Iterate over all Java RBT rule files
# NOTE(review): the tail of this loop is not visible in this copy -- the
# statement(s) after the "Java only" message, the closing braces, and any
# call to convertIndex.  Confirm against the master copy.
foreach (<$DIR/Transliterator_*.txt>) {
# Skip names ending in '~' (presumably editor backup files)
next if (/~$/);
# Skip the index file; it is not a rule file
next if (/_index\.txt$/);
# When an ID was given, skip files whose name does not match it.
# $ID is interpolated as a regular expression, not compared literally.
next if ($ID && !/$ID/);
# $out is '' when the ID is missing from the name map
my ($out, $id) = convertFileName($_);
if ($out) {
if ($out eq $JAVA_ONLY) {
print STDERR "$id: Java only\n";
file($id, $_, $out);
# Convert a Java file name to C
# Param: Java file name of the form m|Transliterator_(.+)\.utf8\.txt$|
#        (a name matching m|Transliterator_(.+)\.txt$| is accepted as well).
#        Any leading directory part is ignored.
# Return: in list context the pair (C base name, ID), where the C base name
#        is 'translit_' followed by the name from %NAME_MAP (or by the ID
#        itself when the mapped name is ""), without a file extension.
#        For an ID mapped to $JAVA_ONLY the first element is $JAVA_ONLY
#        itself, meaning the file isn't intended to be incorporated into C.
#        If the ID has no entry in %NAME_MAP, an error is printed to STDERR
#        and the empty string is returned.
# Dies if the file name matches neither pattern.
sub convertFileName {
    # A lexical is used rather than $_ so the caller's $_ (e.g. the current
    # line of a read loop) is left alone.
    my $java_name = shift;

    my $id;
    if ($java_name =~ m|Transliterator_(.+)\.utf8\.txt$| ||
        $java_name =~ m|Transliterator_(.+)\.txt$|) {
        $id = $1;
    } else {
        die "Can't parse Java file name $java_name";
    }

    if (!exists $NAME_MAP{$id}) {
        print STDERR "ERROR: $id not in map; please update $0\n";
        return '';
    }

    my $out = $NAME_MAP{$id};
    if ($out eq '') {
        # An ICU name of "" means the ICU name == the ID
        $out = $id;
    }
    if ($out ne $JAVA_ONLY) {
        $out = 'translit_' . $out;
    }
    return ($out, $id);
}
# Convert the index file from Java to C format
# Takes no parameters.  Reads $DIR/Transliterator_index.txt line by line and
# writes the converted form to $OUTDIR/translit_index.txt, starting with the
# standard header.  Dies if either file cannot be opened or if a content
# line has an unrecognized type field.
# NOTE(review): as shown in this copy the sub is incomplete -- the
# terminators of both here-documents, the body of the second one, several
# closing braces, and presumably the statements that move on to the next
# line after a comment or blank line has been echoed are not visible.
# Confirm against the master copy before relying on this text.
sub convertIndex {
$JAVA_INDEX = "Transliterator_index.txt";
$C_INDEX = "translit_index.txt";
open(JAVA_INDEX, "$DIR/$JAVA_INDEX") or die;
open(C_INDEX, ">$OUTDIR/$C_INDEX") or die;
# Banner naming the Java index file as the source
header(\*C_INDEX, $JAVA_INDEX);
print C_INDEX <<END;
// N.B.: This file has been generated mechanically from the
// corresponding ICU4J file, which is the master file that receives
// primary updates. The colon-delimited fields have been split into
// separate strings. For 'file' and 'internal' lines, the encoding
// field has been deleted, since the encoding is processed at build
// time in ICU4C. Certain large rule sets not intended for general
// use have been commented out with the notation "Java only".
translit_index {
RuleBasedTransliteratorIDs {
while (<JAVA_INDEX>) {
# Comments; change # to //
if (s|^(\s*)\#|$1//|) {
print C_INDEX;
# Blank lines
if (!/\S/) {
print C_INDEX;
# Content lines
# $prefix stays empty unless the entry turns out to be Java only
my $prefix = '';
# Fields are colon-delimited; field 1 holds the entry type
my @a = split(':', $_);
if ($a[1] eq 'file' || $a[1] eq 'internal') {
# Convert the file name
my $id;
($a[2], $id) = convertFileName($a[2]);
if ($a[2] eq $JAVA_ONLY) {
$prefix = '// Java only: ';
# Delete the encoding field
splice(@a, 3, 1);
} elsif ($a[1] eq 'alias') {
# Pad out with extra blank fields to make the
# 2-d array square
push @a, "";
} else {
die "Can't parse $_";
# Emit the fields as one brace-delimited row of double-quoted strings
print C_INDEX
$prefix, "{ ",
join(", ", map("\"$_\"", @a)),
" },\n";
print C_INDEX <<END;
# Output a header
# Param: Filehandle (glob reference or lexical handle) to write to
# Param: Name of the source file, written on the "// Source:" line
# Writes, in order: $HEADER1, the "// Tool:" line (from $TOOL), the
# "// Source:" line, a "// Date:" line with the current local time,
# $HEADER2, and one empty line.  $HEADER1, $HEADER2 and $TOOL are
# file-level globals.
sub header {
    my ($out, $in) = @_;

    # Block form of print makes it explicit that $out is the handle
    print {$out} $HEADER1;
    print {$out} "// Tool: $TOOL\n// Source: $in\n";
    print {$out} "// Date: ", scalar localtime, "\n";
    print {$out} $HEADER2;
    print {$out} "\n";
}
# Process one file
# Param: ID, e.g. Fullwidth-Halfwidth
# Param: Java input file name, e.g.
# f:/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt
# Param: ICU output file name, e.g. fullhalf
# Writes $OUTDIR/<output name>.txt and prints a one-line progress message
# with the input and output sizes to STDOUT.  Dies if the input or output
# file cannot be opened.
# NOTE(review): as shown in this copy the sub is incomplete.  Several
# comments below ("Look for and delete BOM", "Clean the eol junk up",
# "Add quotes", "Restore single-quoted matter", "Restore escaped
# characters", ...) are not followed by the statements they describe, and
# most closing braces are missing.  Confirm against the master copy before
# relying on this text.
sub file {
my $id = shift;
my $IN = shift;
my $out = shift;
my $OUT = "$out.txt";
# Show input size. Show output size later -- useful for quick sanity check.
print "$id (", -s $IN, ") -> $OUT (";
# Write output file header
open(OUT, ">$OUTDIR/$OUT") or die;
binmode OUT; # Must do this so we can write our UTF8 marker
# Write UTF8 marker
print OUT pack("C3", 0xEF, 0xBB, 0xBF);
print OUT " // -*- Coding: utf-8; -*-\n";
header(\*OUT, $IN);
print OUT "// $id\n";
print OUT "\n";
# Open the resource: the output name, then a Rule block for the rule text
print OUT "$out {\n";
print OUT " Rule {\n";
open(IN, $IN) or die;
binmode IN; # IN is a UTF8 file
# True only while the first input line is being handled
my $first = 1;
my $BOM = pack("C3", 239, 187, 191); # a UTF8 byte order mark
# Process each line by changing # comments to // comments
# and taking other text and enclosing it in double quotes
while (<IN>) {
my $raw = $_;
# Look for and delete BOM
if ($first) {
$first = 0;
# Clean the eol junk up
# If there is a trailing backslash, then delete it -- we don't
# need line continuation in C, since adjacent strings are
# concatenated. Count trailing backslashes; if they are odd,
# one is trailing.
if (m|(\\+)$|) {
if ((length($1) % 2) == 1) {
# Transform escaped characters
if (/^(\s*)(\#.*)$/) {
# Comment-only line
my ($white, $cmt) = ($1, $2);
# Only the first '#' (the comment introducer) becomes '//'
$cmt =~ s|\#|//|;
$_ = $white . $cmt;
} elsif (!/\S/) {
# Blank line -- leave as-is
} else {
# Remove single-quoted matter
my @quotes;
my $nquotes = 0;
my $x = $_;
# Each pass replaces the leftmost quoted run with a <<xN>> placeholder
# and saves the quoted text in @quotes
while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
push @quotes, $2;
# Extract comment
my $cmt = '';
if (s|\#(.*)||) {
$cmt = '//' . $1;
# Add quotes
# Restore single-quoted matter
for (my $i=0; $i<$nquotes; ++$i) {
# Restore comment
$_ .= $cmt;
# Restore escaped characters
print OUT $_, "\n";
# Finish up
print OUT " }\n";
print OUT "}\n";
# Write output file size for sanity check
# NOTE(review): no close of OUT is visible before this point; if the handle
# is still open, buffered data may not yet be counted -- confirm.
print -s "$OUTDIR/$OUT", ")\n";
# Replace each backslash escape in $_ with a <<...>> placeholder.
# Takes no parameters and edits $_ in place.  Presumably this keeps escaped
# quotes and '#' characters from being mistaken for real ones while a rule
# line is processed -- confirm against the caller.
# The substitutions are order-dependent: doubled backslashes must go first
# so that the second backslash of a pair is never read as the start of
# another escape.  The return value is that of the last substitution and is
# not meaningful.
sub hideEscapes {
    # Transform escaped characters
    s|\\\\|<<bs>>|g;                    # DO THIS FIRST Transform backslashes
    s|\\u([a-zA-Z0-9]{4})|<<u$1>>|g;    # Transform Unicode escapes
    s|\\\"|<<dq>>|g;                    # Transform backslash double quote
    s|\\\'|<<sq>>|g;                    # Transform backslash single quote
    s|\\\#|<<lb>>|g;                    # Transform backslash pound
    s|\\(.)|<<q$1>>|g;                  # Transform backslash escapes
}
sub restoreEscapes {
# Restore escaped characters
s|<<u0000>>|\\\\u0000|g; # Double escape U+0000