blob: 4621197a76259459655984ece8a26c5ccbc4a428 [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 2000-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.translit;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public final class TestUtility {
/**
 * Formats a single UTF-16 code unit as exactly four uppercase hexadecimal digits.
 *
 * @param ch the code unit to format
 * @return the hexadecimal value of {@code ch}, left-padded with zeros to four digits
 */
public static String hex(char ch) {
    final String digits = Integer.toString(ch, 16).toUpperCase();
    final StringBuilder padded = new StringBuilder(4);
    // A char produces between one and four hex digits; fill the rest with zeros.
    for (int missing = 4 - digits.length(); missing > 0; --missing) {
        padded.append('0');
    }
    return padded.append(digits).toString();
}
/**
 * Formats an int (typically a code point) as uppercase hexadecimal, left-padded
 * with zeros to a minimum of four digits.
 *
 * <p>Values whose hexadecimal form is longer than four characters (for example
 * supplementary code points such as 0x10000) are returned unpadded. Negative
 * values are rendered the way {@link Integer#toString(int, int)} renders them,
 * i.e. with a leading minus sign that counts toward the four-character minimum.
 *
 * @param ch the value to format
 * @return the uppercase hexadecimal form of {@code ch}, at least four characters long
 */
public static String hex(int ch) {
    String digits = Integer.toString(ch, 16).toUpperCase();
    // Clamp at zero: without the clamp, any value above 0xFFFF produced a
    // negative substring end index and a StringIndexOutOfBoundsException.
    int padding = Math.max(0, 4 - digits.length());
    return "0000".substring(0, padding) + digits;
}
/**
 * Formats a string as hexadecimal values separated by commas.
 * Delegates to {@code hex(String, String)} with {@code ","} as the separator.
 *
 * @param s the string to format
 * @return the comma-separated hexadecimal rendering of {@code s}
 */
public static String hex(String s) {
    final String defaultSeparator = ",";
    return hex(s, defaultSeparator);
}
/**
 * Formats every UTF-16 code unit of a string with {@code hex(char)} and joins
 * the results with the given separator.
 *
 * <p>The string is walked by {@code char}, not by code point, so a surrogate
 * pair contributes two entries to the result.
 *
 * @param s the string to format
 * @param sep the text placed between consecutive entries
 * @return the joined hexadecimal values, or the empty string when {@code s} is empty
 */
public static String hex(String s, String sep) {
    if (s.length() == 0) {
        return "";
    }
    // Accumulate in a StringBuilder; repeated String concatenation in the loop
    // copied the whole partial result on every iteration.
    StringBuilder result = new StringBuilder(hex(s.charAt(0)));
    for (int i = 1; i < s.length(); ++i) {
        result.append(sep).append(hex(s.charAt(i)));
    }
    return result.toString();
}
/**
 * Replaces every non-overlapping, case-sensitive occurrence of
 * {@code toBeReplaced} in {@code source} with {@code replacement}, scanning
 * from left to right.
 *
 * <p>An empty {@code toBeReplaced} matches nothing: {@code source} is returned
 * unchanged. Without that guard the scan index never advanced for an empty
 * pattern, so the loop would not terminate on any non-empty source.
 *
 * @param source the text to search
 * @param toBeReplaced the literal text to look for
 * @param replacement the text substituted for each occurrence
 * @return {@code source} with all occurrences replaced
 */
public static String replace(String source, String toBeReplaced, String replacement) {
    int len = toBeReplaced.length();
    if (len == 0) {
        // Nothing to search for; also avoids the non-terminating scan.
        return source;
    }
    StringBuilder results = new StringBuilder(source.length());
    int i = 0;
    while (i < source.length()) {
        if (source.regionMatches(i, toBeReplaced, 0, len)) {
            results.append(replacement);
            i += len; // skip past the whole match so occurrences never overlap
        } else {
            results.append(source.charAt(i));
            ++i;
        }
    }
    return results.toString();
}
/**
 * Builds a copy of {@code source} in which every code point contained in
 * {@code set} is replaced by {@code replacement}; all other code points are
 * copied through unchanged.
 *
 * <p>The string is read with {@code UTF16.charAt} and the index advances by
 * {@code UTF16.getCharCount} of the code point just read.
 *
 * @param source the text to filter
 * @param set the code points to replace
 * @param replacement the text substituted for each matching code point
 * @return the filtered string
 */
public static String replaceAll(String source, UnicodeSet set, String replacement) {
    final StringBuffer out = new StringBuffer();
    int index = 0;
    while (index < source.length()) {
        final int codePoint = UTF16.charAt(source, index);
        if (!set.contains(codePoint)) {
            UTF16.append(out, codePoint);
        } else {
            out.append(replacement);
        }
        index += UTF16.getCharCount(codePoint);
    }
    return out.toString();
}
// NOTE: all of the old script/block lookup code below is commented out.
/*
public static byte getScript(char c) {
return getScript(getBlock(c));
}
public static byte getScript(byte block) {
return blockToScript[block];
}
public static byte getBlock(char c) {
int index = c >> 7;
byte block = charToBlock[index];
while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
int[] tuple = split[-block-1];
if (c < tuple[0]) block = (byte)tuple[1];
else block = (byte)tuple[2];
}
return block;
}
// returns next letter of script, or 0xFFFF if done
public static char getNextLetter(char c, byte script) {
while (c < 0xFFFF) {
++c;
if (getScript(c) == script && Character.isLetter(c)) {
return c;
}
}
return c;
}
// Supplements to Character methods; these methods go through
// UCharacter if possible. If not, they fall back to Character.
public static boolean isUnassigned(char c) {
try {
return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED;
} catch (NullPointerException e) {
System.out.print("");
}
return Character.getType(c) == Character.UNASSIGNED;
}
public static boolean isLetter(char c) {
try {
return UCharacter.isLetter(c);
} catch (NullPointerException e) {
System.out.print("");
}
return Character.isLetter(c);
}
public static void main(String[] args) {
System.out.println("Blocks: ");
byte lastblock = -128;
for (char cc = 0; cc < 0xFFFF; ++cc) {
byte block = TestUtility.getBlock(cc);
if (block != lastblock) {
System.out.println(TestUtility.hex(cc) + "\t" + block);
lastblock = block;
}
}
System.out.println();
System.out.println("Scripts: ");
byte lastScript = -128;
for (char cc = 0; cc < 0xFFFF; ++cc) {
byte script = TestUtility.getScript(cc);
if (script != lastScript) {
System.out.println(TestUtility.hex(cc) + "\t" + script);
lastScript = script;
}
}
}
public static final byte // SCRIPT CODE
COMMON_SCRIPT = 0,
LATIN_SCRIPT = 1,
GREEK_SCRIPT = 2,
CYRILLIC_SCRIPT = 3,
ARMENIAN_SCRIPT = 4,
HEBREW_SCRIPT = 5,
ARABIC_SCRIPT = 6,
SYRIAC_SCRIPT = 7,
THAANA_SCRIPT = 8,
DEVANAGARI_SCRIPT = 9,
BENGALI_SCRIPT = 10,
GURMUKHI_SCRIPT = 11,
GUJARATI_SCRIPT = 12,
ORIYA_SCRIPT = 13,
TAMIL_SCRIPT = 14,
TELUGU_SCRIPT = 15,
KANNADA_SCRIPT = 16,
MALAYALAM_SCRIPT = 17,
SINHALA_SCRIPT = 18,
THAI_SCRIPT = 19,
LAO_SCRIPT = 20,
TIBETAN_SCRIPT = 21,
MYANMAR_SCRIPT = 22,
GEORGIAN_SCRIPT = 23,
JAMO_SCRIPT = 24,
HANGUL_SCRIPT = 25,
ETHIOPIC_SCRIPT = 26,
CHEROKEE_SCRIPT = 27,
ABORIGINAL_SCRIPT = 28,
OGHAM_SCRIPT = 29,
RUNIC_SCRIPT = 30,
KHMER_SCRIPT = 31,
MONGOLIAN_SCRIPT = 32,
HIRAGANA_SCRIPT = 33,
KATAKANA_SCRIPT = 34,
BOPOMOFO_SCRIPT = 35,
HAN_SCRIPT = 36,
YI_SCRIPT = 37;
public static final byte // block code
RESERVED_BLOCK = 0,
BASIC_LATIN = 1,
LATIN_1_SUPPLEMENT = 2,
LATIN_EXTENDED_A = 3,
LATIN_EXTENDED_B = 4,
IPA_EXTENSIONS = 5,
SPACING_MODIFIER_LETTERS = 6,
COMBINING_DIACRITICAL_MARKS = 7,
GREEK = 8,
CYRILLIC = 9,
ARMENIAN = 10,
HEBREW = 11,
ARABIC = 12,
SYRIAC = 13,
THAANA = 14,
DEVANAGARI = 15,
BENGALI = 16,
GURMUKHI = 17,
GUJARATI = 18,
ORIYA = 19,
TAMIL = 20,
TELUGU = 21,
KANNADA = 22,
MALAYALAM = 23,
SINHALA = 24,
THAI = 25,
LAO = 26,
TIBETAN = 27,
MYANMAR = 28,
GEORGIAN = 29,
HANGUL_JAMO = 30,
ETHIOPIC = 31,
CHEROKEE = 32,
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
OGHAM = 34,
RUNIC = 35,
KHMER = 36,
MONGOLIAN = 37,
LATIN_EXTENDED_ADDITIONAL = 38,
GREEK_EXTENDED = 39,
GENERAL_PUNCTUATION = 40,
SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
CURRENCY_SYMBOLS = 42,
COMBINING_MARKS_FOR_SYMBOLS = 43,
LETTERLIKE_SYMBOLS = 44,
NUMBER_FORMS = 45,
ARROWS = 46,
MATHEMATICAL_OPERATORS = 47,
MISCELLANEOUS_TECHNICAL = 48,
CONTROL_PICTURES = 49,
OPTICAL_CHARACTER_RECOGNITION = 50,
ENCLOSED_ALPHANUMERICS = 51,
BOX_DRAWING = 52,
BLOCK_ELEMENTS = 53,
GEOMETRIC_SHAPES = 54,
MISCELLANEOUS_SYMBOLS = 55,
DINGBATS = 56,
BRAILLE_PATTERNS = 57,
CJK_RADICALS_SUPPLEMENT = 58,
KANGXI_RADICALS = 59,
IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
CJK_SYMBOLS_AND_PUNCTUATION = 61,
HIRAGANA = 62,
KATAKANA = 63,
BOPOMOFO = 64,
HANGUL_COMPATIBILITY_JAMO = 65,
KANBUN = 66,
BOPOMOFO_EXTENDED = 67,
ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
CJK_COMPATIBILITY = 69,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
CJK_UNIFIED_IDEOGRAPHS = 71,
YI_SYLLABLES = 72,
YI_RADICALS = 73,
HANGUL_SYLLABLES = 74,
HIGH_SURROGATES = 75,
HIGH_PRIVATE_USE_SURROGATES = 76,
LOW_SURROGATES = 77,
PRIVATE_USE = 78,
CJK_COMPATIBILITY_IDEOGRAPHS = 79,
ALPHABETIC_PRESENTATION_FORMS = 80,
ARABIC_PRESENTATION_FORMS_A = 81,
COMBINING_HALF_MARKS = 82,
CJK_COMPATIBILITY_FORMS = 83,
SMALL_FORM_VARIANTS = 84,
ARABIC_PRESENTATION_FORMS_B = 85,
SPECIALS = 86,
HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
static final byte[] blockToScript = {
COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
LATIN_SCRIPT, // 1, BASIC_LATIN
LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
LATIN_SCRIPT, // 5, IPA_EXTENSIONS
COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
GREEK_SCRIPT, // 8, GREEK
CYRILLIC_SCRIPT, // 9, CYRILLIC
ARMENIAN_SCRIPT, // 10, ARMENIAN
HEBREW_SCRIPT, // 11, HEBREW
ARABIC_SCRIPT, // 12, ARABIC
SYRIAC_SCRIPT, // 13, SYRIAC
THAANA_SCRIPT, // 14, THAANA
DEVANAGARI_SCRIPT, // 15, DEVANAGARI
BENGALI_SCRIPT, // 16, BENGALI
GURMUKHI_SCRIPT, // 17, GURMUKHI
GUJARATI_SCRIPT, // 18, GUJARATI
ORIYA_SCRIPT, // 19, ORIYA
TAMIL_SCRIPT, // 20, TAMIL
TELUGU_SCRIPT, // 21, TELUGU
KANNADA_SCRIPT, // 22, KANNADA
MALAYALAM_SCRIPT, // 23, MALAYALAM
SINHALA_SCRIPT, // 24, SINHALA
THAI_SCRIPT, // 25, THAI
LAO_SCRIPT, // 26, LAO
TIBETAN_SCRIPT, // 27, TIBETAN
MYANMAR_SCRIPT, // 28, MYANMAR
GEORGIAN_SCRIPT, // 29, GEORGIAN
JAMO_SCRIPT, // 30, HANGUL_JAMO
ETHIOPIC_SCRIPT, // 31, ETHIOPIC
CHEROKEE_SCRIPT, // 32, CHEROKEE
ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
OGHAM_SCRIPT, // 34, OGHAM
RUNIC_SCRIPT, // 35, RUNIC
KHMER_SCRIPT, // 36, KHMER
MONGOLIAN_SCRIPT, // 37, MONGOLIAN
LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
GREEK_SCRIPT, // 39, GREEK_EXTENDED
COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
COMMON_SCRIPT, // 45, NUMBER_FORMS
COMMON_SCRIPT, // 46, ARROWS
COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
COMMON_SCRIPT, // 49, CONTROL_PICTURES
COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
COMMON_SCRIPT, // 52, BOX_DRAWING
COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
COMMON_SCRIPT, // 56, DINGBATS
COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
HAN_SCRIPT, // 59, KANGXI_RADICALS
HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
HIRAGANA_SCRIPT, // 62, HIRAGANA
KATAKANA_SCRIPT, // 63, KATAKANA
BOPOMOFO_SCRIPT, // 64, BOPOMOFO
JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
HAN_SCRIPT, // 66, KANBUN
BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
YI_SCRIPT, // 72, YI_SYLLABLES
YI_SCRIPT, // 73, YI_RADICALS
HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
COMMON_SCRIPT, // 75, HIGH_SURROGATES
COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
COMMON_SCRIPT, // 77, LOW_SURROGATES
COMMON_SCRIPT, // 78, PRIVATE_USE
HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
COMMON_SCRIPT, // 86, SPECIALS
COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
COMMON_SCRIPT, // 88, SPECIALS
};
// could be further reduced to a byte array, but I didn't bother.
static final int[][] split = {
{0x0250, 4, 5}, // -1
{0x02B0, 5, 6}, // -2
{0x0370, 7, 8}, // -3
{0x0530, 0, 10}, // -4
{0x0590, 10, 11}, // -5
{0x0750, 13, 0}, // -6
{0x07C0, 14, 0}, // -7
{0x10A0, 28, 29}, // -8
{0x13A0, 0, 32}, // -9
{0x16A0, 34, 35}, // -10
{0x18B0, 37, 0}, // -11
{0x2070, 40, 41}, // -12
{0x20A0, 41, -31}, // -13
{0x2150, 44, 45}, // -14
{0x2190, 45, 46}, // -15
{0x2440, 49, -32}, // -16
{0x25A0, 53, 54}, // -17
{0x27C0, 56, 0}, // -18
{0x2FE0, 59, -33}, // -19
{0x3040, 61, 62}, // -20
{0x30A0, 62, 63}, // -21
{0x3130, 64, 65}, // -22
{0x3190, 65, -34}, // -23
{0x4DB6, 70, 0}, // -24
{0xA490, 72, -35}, // -25
{0xD7A4, 74, 0}, // -26
{0xFB50, 80, 81}, // -27
{0xFE20, 0, -36}, // -28
{0xFEFF, 85, 86}, // -29
{0xFFF0, 87, -37}, // -30
{0x20D0, 42, 43}, // -31
{0x2460, 50, 51}, // -32
{0x2FF0, 0, 60}, // -33
{0x31A0, 66, -38}, // -34
{0xA4D0, 73, 0}, //-35
{0xFE30, 82, -39}, //-36
{0xFFFE, 88, 0}, //-37
{0x31C0, 67, 0}, // -38
{0xFE50, 83, -40}, //-39
{0xFE70, 84, 85} // -40
};
static final byte[] charToBlock = {
1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
-12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
-20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
};
*/
}