src/com/ibm/tools/translit/dumpICUrules.bat - external/github.com/unicode-org/icu - Git at Google

 @rem = '--*-Perl-*--
 @echo off
 if "%OS%" == "Windows_NT" goto WinNT
 perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
 goto endofperl
 :WinNT
 perl -x -S "%0" %*
 if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
 if %errorlevel% == 9009 echo You do not have Perl in your PATH.
 goto endofperl
 @rem ';
 #!perl
 #line 14

 # This perl script creates ICU transliterator data files, that live
 # in icu/data, from ICU4J UTF8 transliterator data files, in
 # icu4j/src/com/ibm/text/resources.
 #
 # The transformation that is done is very minimal.  The script assumes
 # that the input files use only # comments
 # and that they follow a rigid format.
 #
 # The output files are named according to ICU conventions (see NAME_MAP
 # below) and created in the current directory.  They should be manually
 # checked and then copied into the icu/data directory.  An ICU build must
 # then be initiated, and the standard suite of ICU transliterator tests
 # should be run after that.
 #
 # Alan Liu 5/19/00 2/27/01

 # Exactly one command-line argument is required, and it must name an
 # existing directory; anything else prints the usage text and aborts.
 usage() unless @ARGV == 1;
 $DIR = shift @ARGV;
 usage() unless -d $DIR;

 sub usage {
     # Report the script by its base name only: drop everything up to
     # and including the last path separator (either slash style).
     (my $script = $0) =~ s{.+[/\\]}{};

     print
         "Usage: $script <dir>\n",
         " where <dir> contains the Transliterator_*.utf8.txt\n",
         " files.\n",
         "e.g., $script F:/icu4j/src/com/ibm/text/resources\n";

     # Never returns: the caller cannot continue without a valid <dir>.
     die;
 }

 # Marker value in NAME_MAP for rule sets that exist only in ICU4J.
 $JAVA_ONLY = '-';

 # Mapping from Java file names (the ID part of
 # Transliterator_<ID>.utf8.txt) to ICU file names.
 %NAME_MAP = (
     Fullwidth_Halfwidth        => 'fullhalf',
     Hiragana_Katakana          => 'kana',
     KeyboardEscape_Latin1      => 'kbdescl1',
     Latin_Arabic               => 'larabic',
     Latin_Cyrillic             => 'lcyril',
     Latin_Devanagari           => 'ldevan',
     Latin_Greek                => 'lgreek',
     Latin_Hebrew               => 'lhebrew',
     Latin_Jamo                 => 'ljamo',
     Latin_Kana                 => 'lkana',
     StraightQuotes_CurlyQuotes => 'quotes',
     UnicodeName_UnicodeChar    => 'ucname',

     # An ICU name of '' means the ICU name == the ID
     Bengali_InterIndic    => '',
     Devanagari_InterIndic => '',
     Gujarati_InterIndic   => '',
     Gurmukhi_InterIndic   => '',
     Kannada_InterIndic    => '',
     Malayalam_InterIndic  => '',
     Oriya_InterIndic      => '',
     Tamil_InterIndic      => '',
     Telugu_InterIndic     => '',
     InterIndic_Bengali    => '',
     InterIndic_Devanagari => '',
     InterIndic_Gujarati   => '',
     InterIndic_Gurmukhi   => '',
     InterIndic_Kannada    => '',
     InterIndic_Malayalam  => '',
     InterIndic_Oriya      => '',
     InterIndic_Tamil      => '',
     InterIndic_Telugu     => '',

     # These files are large, so ICU doesn't want them
     Han_Pinyin     => $JAVA_ONLY,
     Kanji_English  => $JAVA_ONLY,
     Kanji_OnRomaji => $JAVA_ONLY,
 );

 # Header blocks of text written at start of ICU output files.
 # header() prints $HEADER1, then the Tool/Source/Date lines, then
 # $HEADER2, so the two blocks bracket the per-file details.
 $HEADER1 = <<END;
 //--------------------------------------------------------------------
 // Copyright (c) 1999-2001, International Business Machines
 // Corporation and others.  All Rights Reserved.
 //--------------------------------------------------------------------
 // THIS IS A MACHINE-GENERATED FILE
 END
 $HEADER2 = <<END;
 //--------------------------------------------------------------------
 END

 # Script name as invoked; header() records it in the "// Tool:" line.
 $TOOL = $0;

 # Iterate over all Java RBT rule files, then convert the index.
 #
 # The directory is listed with opendir/readdir instead of the <...>
 # glob operator: glob splits its pattern on whitespace, so a $DIR that
 # contains a space (e.g. "C:/Program Files/...") would match nothing.
 # The name filter is the same shape the old glob used
 # (Transliterator_*.utf8.txt), which also makes a separate check for
 # editor backup files ("...~") unnecessary.
 {
     opendir(my $rules_dh, $DIR)
         or die "Can't read directory $DIR: $!";
     my @rule_names =
         grep { /^Transliterator_.*\.utf8\.txt\z/ } readdir($rules_dh);
     closedir($rules_dh);

     # Sorted so that the progress output is in a stable order.
     foreach my $name (sort @rule_names) {
         my $path = "$DIR/$name";
         my ($out, $id) = convertFileName($path);

         # '' means the ID is not in NAME_MAP; convertFileName has
         # already reported it on STDERR.
         next unless $out;

         if ($out eq $JAVA_ONLY) {
             print STDERR "$id: Java only\n";
             next;
         }
         file($id, $path, $out);
     }
 }

 convertIndex();

 ######################################################################
 # Convert a Java file name to C
 # Param: Java file name of the form m|Transliterator_(.+)\.utf8\.txt$|,
 #  optionally preceded by a directory path
 # Return: the list (C name, ID), e.g. ("ldevan", "Latin_Devanagari").
 #  The C name carries no ".txt" suffix; it is the ID itself when
 #  NAME_MAP maps the ID to "", and it is $JAVA_ONLY if the given file
 #  isn't intended to be incorporated into C.
 #  If the ID has no entry in NAME_MAP, an error is printed on STDERR
 #  and the empty string is returned instead.
 #  Dies if the file name does not have the expected form.
 sub convertFileName {
     my ($java_name) = @_;

     # The ID must not span a path separator.  With a plain (.+) a
     # directory component that happens to contain "Transliterator_"
     # would be swallowed into the ID and the NAME_MAP lookup would fail.
     my ($id) = $java_name =~ m|Transliterator_([^/\\]+)\.utf8\.txt$|
         or die "Can't parse Java file name $java_name";

     if (!exists $NAME_MAP{$id}) {
         print STDERR "ERROR: $id not in map; please update $0\n";
         return '';
     }

     # An empty mapping means "use the ID as the ICU name".
     my $out = $NAME_MAP{$id};
     $out = $id if $out eq '';

     return ($out, $id);
 }

 ######################################################################
 # Convert the index file from Java to C format.
 #
 # Reads $DIR/Transliterator_index.txt and writes translit_index.txt in
 # the current directory.  Comment lines have their leading # changed to
 # //, blank lines are copied, and every other line is split on ':' and
 # written as a brace-enclosed list of quoted strings.  Only 'file',
 # 'internal' and 'alias' entries are understood; anything else is fatal.
 sub convertIndex {
     my $JAVA_INDEX = "Transliterator_index.txt";
     my $C_INDEX = "translit_index.txt";

     # Three-argument open with lexical handles; failures report the
     # file and the OS error instead of a bare "Died".
     open(my $java_fh, '<', "$DIR/$JAVA_INDEX")
         or die "Can't open $DIR/$JAVA_INDEX for reading: $!";
     open(my $c_fh, '>', $C_INDEX)
         or die "Can't open $C_INDEX for writing: $!";

     header($c_fh, $JAVA_INDEX);

     print {$c_fh} <<END;
 //--------------------------------------------------------------------
 // N.B.: This file has been generated mechanically from the
 // corresponding ICU4J file, which is the master file that receives
 // primary updates.  The colon-delimited fields have been split into
 // separate strings.  For 'file' and 'internal' lines, the encoding
 // field has been deleted, since the encoding is processed at build
 // time in ICU4C.  Certain large rule sets not intended for general
 // use have been commented out with the notation "Java only".
 //--------------------------------------------------------------------

 translit_index {
   RuleBasedTransliteratorIDs {
 END

     while (my $line = <$java_fh>) {
         # Comments; change # to //
         if ($line =~ s|^(\s*)\#|$1//|) {
             print {$c_fh} $line;
             next;
         }
         # Blank lines
         if ($line !~ /\S/) {
             print {$c_fh} $line;
             next;
         }

         # Content lines.  Strip CR as well as LF (as file() does) so
         # that a CRLF index read on a non-Windows host does not leave
         # a stray CR inside the last field.
         $line =~ s/[\x0D\x0A]+$//;
         my $prefix = '';
         my @a = split(':', $line);
         if ($a[1] eq 'file' || $a[1] eq 'internal') {
             # Convert the file name
             # NOTE(review): for an ID missing from NAME_MAP,
             # convertFileName reports the error and returns '', so the
             # entry is still written, with an empty file name.
             my $id;
             ($a[2], $id) = convertFileName($a[2]);
             if ($a[2] eq $JAVA_ONLY) {
                 $prefix = '// Java only: ';
             }
             # Delete the encoding field
             splice(@a, 3, 1);
         } elsif ($a[1] eq 'alias') {
             # Pad out with extra blank fields to make the
             # 2-d array square
             push @a, "";
         } else {
             die "Can't parse $line";
         }
         print {$c_fh}
             $prefix, "{ ",
             join(", ", map { "\"$_\"" } @a),
             " },\n";
     }

     print {$c_fh} <<END;
   }
 }
 END

     # Buffered write errors only surface at close, so check it.
     close($c_fh) or die "Error closing $C_INDEX: $!";
     close($java_fh);
     print STDERR "$JAVA_INDEX -> $C_INDEX\n";
 }

 ######################################################################
 # Output a header
 # Param: Filehandle to write to
 # Param: Name of the source file, recorded in the "Source:" line
 # Writes $HEADER1, the Tool/Source/Date lines, $HEADER2 and one blank
 # line, in that order.
 sub header {
     my ($fh, $source) = @_;

     # localtime in scalar context yields a printable date string.
     my $stamp = localtime;

     print {$fh}
         $HEADER1,
         "// Tool: $TOOL\n// Source: $source\n",
         "// Date: ", $stamp, "\n",
         $HEADER2,
         "\n";
 }

 ######################################################################
 # Process one file: convert a Java rule file into an ICU resource file.
 # Param: ID, e.g. Fullwidth_Halfwidth
 # Param: Java input file name, e.g.
 #  f:/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt
 # Param: ICU output file name, e.g. fullhalf
 #
 # Writes "<output name>.txt" in the current directory: a UTF-8 marker,
 # the standard header, then every input line inside "<name> { Rule { ... } }".
 # Comment lines have # changed to //, rule text is enclosed in double
 # quotes (a trailing # comment stays outside the quotes), and blank
 # lines are copied.  Progress, including the input and output sizes,
 # is printed on STDOUT.  Dies if either file cannot be opened or the
 # output cannot be closed.
 sub file {
     my ($id, $IN, $out) = @_;

     my $OUT = "$out.txt";

     # Show input size. Show output size later -- useful for quick sanity check.
     print "$id (", -s $IN, ") -> $OUT (";

     # Open the input before creating the output, so an unreadable
     # source does not leave a truncated output file behind.
     open(my $in_fh, '<', $IN) or die "Can't open $IN for reading: $!";
     binmode $in_fh; # IN is a UTF8 file

     # Write output file header
     open(my $out_fh, '>', $OUT) or die "Can't open $OUT for writing: $!";
     binmode $out_fh; # Must do this so we can write our UTF8 marker

     # Write UTF8 marker
     print {$out_fh} pack("C3", 0xEF, 0xBB, 0xBF);
     print {$out_fh} " // -*- Coding: utf-8; -*-\n";

     header($out_fh, $IN);
     print {$out_fh} "// $id\n";
     print {$out_fh} "\n";
     print {$out_fh} "$out {\n";
     print {$out_fh} "  Rule {\n";

     # One unit of rule text: a hidden \# placeholder, a backslash
     # escape that is still in raw form, or any character other than #.
     # hideEscapes turns \# into <<q#>>, which itself contains a '#';
     # matching the placeholder as a unit keeps that '#' from being
     # mistaken for the start of a trailing comment.
     my $rule_piece = qr/<<q\#>>|\\.|[^\#]/;

     # Process each line by changing # comments to // comments
     # and taking other text and enclosing it in double quotes.
     # The loop works on $_ because hideEscapes/restoreEscapes do.
     while (<$in_fh>) {
         my $raw = $_;

         # Clean the eol junk up
         s/[\x0D\x0A]+$//;

         # Transform escaped characters
         hideEscapes();

         if (/^(\s*)(\#.*)$/) {
             # Comment-only line
             my ($white, $cmt) = ($1, $2);
             $cmt =~ s|\#|//|;
             $_ = $white . $cmt;

         } elsif (/^(\s*)((?=\S)(?:$rule_piece)+?)(\s*)(\#.*)?$/) {
             # Rule line with optional comment
             my ($white1, $rule, $white2, $cmt) = ($1, $2, $3, $4);
             $cmt = '' unless defined $cmt; # no trailing comment
             $cmt =~ s|\#|//|;
             $_ = $white1 . '"' . $rule . '"' . $white2 . $cmt;

         } elsif (!/\S/) {
             # Blank line -- leave as-is

         } else {
             # Unparseable line
             print STDERR "Error: Can't parse line: $raw";
         }

         # Restore escaped characters
         restoreEscapes();

         print {$out_fh} $_, "\n";
     }

     # Finish up
     close($in_fh);
     print {$out_fh} "  }\n";
     print {$out_fh} "}\n";
     # Buffered write errors only surface at close, so check it.
     close($out_fh) or die "Error closing $OUT: $!";

     # Write output file size for sanity check
     print((-s $OUT), ")\n");
 }

 ######################################################################
 sub hideEscapes {
     # Transform escaped characters in $_ into <<...>> placeholders
     # (restoreEscapes is the inverse):
     #   \uXXXX -> <<uXXXX>>   Unicode escape
     #   \"     -> <<dq>>      backslash double quote
     #   \c     -> <<qc>>      any other backslash escape
     #
     # All escapes are handled in ONE left-to-right pass with /g:
     #  - every escape on the line is hidden, not just the first;
     #  - an escaped backslash consumes its partner, so in \\" or
     #    \\u0041 the quote / "u0041" is correctly left as plain text.
     s{\\(?:u([a-zA-Z0-9]{4})|(.))}{
         defined $1 ? "<<u$1>>"
       : $2 eq '"' ? '<<dq>>'
       :             "<<q$2>>"
     }ge;
 }

 ######################################################################
 sub restoreEscapes {
     # Restore escaped characters: turn the <<...>> placeholders in $_
     # back into backslash escapes.  The steps run in this order; in
     # particular <<u0000>> must be handled before the general
     # <<uXXXX>> rule because U+0000 gets a doubled backslash.
     my @steps = (
         [ qr/<<dq>>/,      sub { '\\"' }         ],
         [ qr/<<q(.)>>/,    sub { '\\' . $_[0] }  ],
         [ qr/<<u0000>>/,   sub { '\\\\u0000' }   ], # Double escape U+0000
         [ qr/<<u(....)>>/, sub { '\\u' . $_[0] } ],
     );

     for my $step (@steps) {
         my ($placeholder, $escape_for) = @$step;
         s/$placeholder/$escape_for->($1)/ge;
     }
 }

 __END__
 :endofperl
	@rem = '--*-Perl-*--
	@echo off
	if "%OS%" == "Windows_NT" goto WinNT
	perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
	goto endofperl
	:WinNT
	perl -x -S "%0" %*
	if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
	if %errorlevel% == 9009 echo You do not have Perl in your PATH.
	goto endofperl
	@rem ';
	#!perl
	#line 14

	# This perl script creates ICU transliterator data files, that live
	# in icu/data, from ICU4J UTF8 transliterator data files, in
	# icu4j/src/com/ibm/text/resources.
	#
	# The transformation that is done is very minimal. The script assumes
	# that the input files use only # comments
	# and that they follow a rigid format.
	#
	# The output files are named according to ICU conventions (see NAME_MAP
	# below) and created in the current directory. They should be manually
	# checked and then copied into the icu/data directory. An ICU build must
	# then be initiated, and the standard suite of ICU transliterator tests
	# should be run after that.
	#
	# Alan Liu 5/19/00 2/27/01

	if (scalar @ARGV != 1) {
	usage();
	}
	$DIR = shift;
	if (! -d $DIR) {
	usage();
	}

	sub usage {
	my $me = $0;
	$me =~ s|.+[/\\]||;
	print "Usage: $me <dir>\n";
	print " where <dir> contains the Transliterator_*.utf8.txt\n";
	print " files.\n";
	print "e.g., $me F:/icu4j/src/com/ibm/text/resources\n";
	die;
	}

	$JAVA_ONLY = '-';

	# Mapping from Java file names to ICU file names
	%NAME_MAP = (
	"Fullwidth_Halfwidth" => "fullhalf",
	"Hiragana_Katakana" => "kana",
	"KeyboardEscape_Latin1" => "kbdescl1",
	"Latin_Arabic" => "larabic",
	"Latin_Cyrillic" => "lcyril",
	"Latin_Devanagari" => "ldevan",
	"Latin_Greek" => "lgreek",
	"Latin_Hebrew" => "lhebrew",
	"Latin_Jamo" => "ljamo",
	"Latin_Kana" => "lkana",
	"StraightQuotes_CurlyQuotes" => "quotes",
	"UnicodeName_UnicodeChar" => "ucname",

	# An ICU name of "" means the ICU name == the ID
	"Bengali_InterIndic" => "",
	"Devanagari_InterIndic" => "",
	"Gujarati_InterIndic" => "",
	"Gurmukhi_InterIndic" => "",
	"Kannada_InterIndic" => "",
	"Malayalam_InterIndic" => "",
	"Oriya_InterIndic" => "",
	"Tamil_InterIndic" => "",
	"Telugu_InterIndic" => "",
	"InterIndic_Bengali" => "",
	"InterIndic_Devanagari" => "",
	"InterIndic_Gujarati" => "",
	"InterIndic_Gurmukhi" => "",
	"InterIndic_Kannada" => "",
	"InterIndic_Malayalam" => "",
	"InterIndic_Oriya" => "",
	"InterIndic_Tamil" => "",
	"InterIndic_Telugu" => "",

	# These files are large, so ICU doesn't want them
	"Han_Pinyin" => $JAVA_ONLY,
	"Kanji_English" => $JAVA_ONLY,
	"Kanji_OnRomaji" => $JAVA_ONLY,
	);

	# Header blocks of text written at start of ICU output files
	$HEADER1 = <<END;
	//--------------------------------------------------------------------
	// Copyright (c) 1999-2001, International Business Machines
	// Corporation and others. All Rights Reserved.
	//--------------------------------------------------------------------
	// THIS IS A MACHINE-GENERATED FILE
	END
	$HEADER2 = <<END;
	//--------------------------------------------------------------------
	END

	$TOOL = $0;

	# Iterate over all Java RBT rule files
	foreach (<$DIR/Transliterator_*.utf8.txt>) {
	next if (/~$/);
	my ($out, $id) = convertFileName($_);
	if ($out) {
	if ($out eq $JAVA_ONLY) {
	print STDERR "$id: Java only\n";
	next;
	}
	file($id, $_, $out);
	}
	}

	convertIndex();

	######################################################################
	# Convert a Java file name to C
	# Param: Java file name of the form m|Transliterator_(.+)\.utf8\.txt$|
	# Return: A C file name (e.g., ldevan.txt) or the empty string,
	# if there is no mapping, or $JAVA_ONLY if the given file isn't
	# intended to be incorporated into C.
	sub convertFileName {
	local $_ = shift;
	my $id;
	if (m|Transliterator_(.+)\.utf8\.txt$|) {
	$id = $1;
	} else { die "Can't parse Java file name $_"; }
	if (!exists $NAME_MAP{$id}) {
	print STDERR "ERROR: $id not in map; please update $0\n";
	return '';
	}
	my $out = $NAME_MAP{$id};
	if ($out eq '') {
	$out = $id;
	}
	return ($out, $id);
	}

	######################################################################
	# Convert the index file from Java to C format
	sub convertIndex {
	$JAVA_INDEX = "Transliterator_index.txt";
	$C_INDEX = "translit_index.txt";
	open(JAVA_INDEX, "$DIR/$JAVA_INDEX") or die;
	open(C_INDEX, ">$C_INDEX") or die;

	header(\*C_INDEX, $JAVA_INDEX);

	print C_INDEX <<END;
	//--------------------------------------------------------------------
	// N.B.: This file has been generated mechanically from the
	// corresponding ICU4J file, which is the master file that receives
	// primary updates. The colon-delimited fields have been split into
	// separate strings. For 'file' and 'internal' lines, the encoding
	// field has been deleted, since the encoding is processed at build
	// time in ICU4C. Certain large rule sets not intended for general
	// use have been commented out with the notation "Java only".
	//--------------------------------------------------------------------

	translit_index {
	RuleBasedTransliteratorIDs {
	END

	while (<JAVA_INDEX>) {
	# Comments; change # to //
	if (s|^(\s*)\#|$1//|) {
	print C_INDEX;
	next;
	}
	# Blank lines
	if (!/\S/) {
	print C_INDEX;
	next;
	}
	# Content lines
	chomp;
	my $prefix = '';
	my @a = split(':', $_);
	if ($a[1] eq 'file' || $a[1] eq 'internal') {
	# Convert the file name
	my $id;
	($a[2], $id) = convertFileName($a[2]);
	if ($a[2] eq $JAVA_ONLY) {
	$prefix = '// Java only: ';
	}
	# Delete the encoding field
	splice(@a, 3, 1);
	} elsif ($a[1] eq 'alias') {
	# Pad out with extra blank fields to make the
	# 2-d array square
	push @a, "";
	} else {
	die "Can't parse $_";
	}
	print C_INDEX
	$prefix, "{ ",
	join(", ", map("\"$_\"", @a)),
	" },\n";
	}

	print C_INDEX <<END;
	}
	}
	END

	close(C_INDEX);
	close(JAVA_INDEX);
	print STDERR "$JAVA_INDEX -> $C_INDEX\n";
	}

	######################################################################
	# Output a header
	# Param: Filehandle
	sub header {
	my $out = shift;
	my $in = shift;
	print $out $HEADER1;
	print $out "// Tool: $TOOL\n// Source: $in\n";
	print $out "// Date: ", scalar localtime, "\n";
	print $out $HEADER2;
	print $out "\n";
	}

	######################################################################
	# Process one file
	# Param: ID, e.g. Fullwidth-Halfwidth
	# Param: Java input file name, e.g.
	# f:/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt
	# Param: ICU output file name, e.g. fullhalf
	sub file {
	my $id = shift;
	my $IN = shift;
	my $out = shift;

	my $OUT = "$out.txt";

	# Show input size. Show output size later -- useful for quick sanity check.
	print "$id (", -s $IN, ") -> $OUT (";

	# Write output file header
	open(OUT, ">$OUT") or die;
	binmode OUT; # Must do this so we can write our UTF8 marker

	# Write UTF8 marker
	print OUT pack("C3", 0xEF, 0xBB, 0xBF);
	print OUT " // -*- Coding: utf-8; -*-\n";

	header(\*OUT, $IN);
	print OUT "// $id\n";
	print OUT "\n";
	print OUT "$out {\n";
	print OUT " Rule {\n";

	open(IN, $IN) or die;
	binmode IN; # IN is a UTF8 file

	# Process each line by changing # comments to // comments
	# and taking other text and enclosing it in double quotes
	while (<IN>) {
	my $raw = $_;

	# Clean the eol junk up
	s/[\x0D\x0A]+$//;

	# Transform escaped characters
	hideEscapes();

	if (/^(\s*)(\#.*)$/) {
	# Comment-only line
	my ($white, $cmt) = ($1, $2);
	$cmt =~ s|\#|//|;
	$_ = $white . $cmt;

	} elsif (/^(\s*)(\S.*?)(\s*)(\#.*)?$/) {
	# Rule line with optional comment
	my ($white1, $rule, $white2, $cmt) = ($1, $2, $3, $4);
	$cmt =~ s|\#|//| if ($cmt);
	$_ = $white1 . '"' . $rule . '"' . $white2 . $cmt;

	} elsif (!/\S/) {
	# Blank line -- leave as-is

	} else {
	# Unparseable line
	print STDERR "Error: Can't parse line: $raw";
	}

	# Restore escaped characters
	restoreEscapes();

	print OUT $_, "\n";
	}

	# Finish up
	close(IN);
	print OUT " }\n";
	print OUT "}\n";
	close(OUT);

	# Write output file size for sanity check
	print -s $OUT, ")\n";
	}

	######################################################################
	sub hideEscapes {
	# Transform escaped characters
	s|\\u([a-zA-Z0-9]{4})|<<u$1>>|g; # Transform Unicode escapes
	s|\\\"|<<dq>>|; # Transform backslash double quote
	s|\\(.)|<<q$1>>|; # Transform backslash escapes
	}

	######################################################################
	sub restoreEscapes {
	# Restore escaped characters
	s|<<dq>>|\\\"|g;
	s|<<q(.)>>|\\$1|g;
	s|<<u0000>>|\\\\u0000|g; # Double escape U+0000
	s|<<u(....)>>|\\u$1|g;
	}

	__END__
	:endofperl