| package com.ibm.text; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSetIterator; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.Normalizer; |
| import com.ibm.icu.lang.UCharacter; |
| import java.util.BitSet; |
| import java.util.Set; |
| import java.util.HashSet; |
| import java.util.TreeSet; |
| import java.util.Iterator; |
| import java.text.NumberFormat; |
| import com.ibm.text.utility.FastIntBinarySearch; |
| |
| public class TestICU4J { |
| public static void main(String[] args) { |
| String a = UTF16.valueOf(0x10000); |
| String b = Normalizer.normalize("a\u0308", Normalizer.NFC); |
| System.out.println(b); |
| /* |
| System.out.println(UCharacter.getType(0x10FFFF)); |
| System.out.println(UCharacter.getName(0x61)); |
| */ |
| testUnicodeSetSpeed(Character.TITLECASE_LETTER, 100); |
| testUnicodeSetSpeed(Character.UNASSIGNED, 1); |
| } |
| |
| static final boolean SHOW_ERRORS = false; |
| static boolean OPTIMIZATION = true; |
| |
| static void testUnicodeSetSpeed(int prop, int ITERATIONS) { |
| NumberFormat numb = NumberFormat.getNumberInstance(); |
| NumberFormat percent = NumberFormat.getPercentInstance(); |
| double start, delta, oldDelta; |
| int temp = 0; |
| Set s; |
| UnicodeSet us; |
| Iterator it; |
| UnicodeSetIterator uit; |
| |
| BitSet bs = new BitSet(); |
| System.out.println(); |
| System.out.println("Getting characters for property " + prop); |
| int total = 0; |
| for (int cp = 0; cp < 0x10FFFF; ++cp) { |
| if (UCharacter.getType(cp) == prop) { |
| bs.set(cp); |
| ++total; |
| } |
| } |
| System.out.println("Total characters: " + numb.format(total)); |
| System.out.println("Loop Iterations: " + numb.format(ITERATIONS)); |
| System.out.println(); |
| |
| System.out.println("Testing Add speed"); |
| |
| s = new TreeSet(); |
| start = System.currentTimeMillis(); |
| for (int i = 0; i < ITERATIONS; ++i) { |
| s.clear(); |
| for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
| if (bs.get(cp)) { |
| s.add(new Integer(cp)); |
| } |
| } |
| } |
| oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS; |
| System.out.println("Set add time: " + numb.format(delta)); |
| System.out.println("Total characters: " + numb.format(s.size())); |
| |
| us = new UnicodeSet(); |
| start = System.currentTimeMillis(); |
| for (int i = 0; i < ITERATIONS; ++i) { |
| us.clear(); |
| for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
| if (bs.get(cp)) { |
| optimizedAdd(us,cp); |
| } |
| } |
| } |
| optimizedDone(us); |
| delta = (System.currentTimeMillis() - start)/ITERATIONS; |
| System.out.println("UnicodeSet add time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta)); |
| System.out.println("Total characters: " + numb.format(us.size()) + ", ranges: " + us.getRangeCount()); |
| |
| System.out.println(); |
| System.out.println("Testing Contains speed"); |
| |
| start = System.currentTimeMillis(); |
| for (int i = 0; i < ITERATIONS; ++i) { |
| for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
| if (s.contains(new Integer(cp)) != bs.get(cp)) { |
| if (SHOW_ERRORS) System.out.println("Error at: " + info(cp)); |
| } |
| } |
| } |
| oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS; |
| System.out.println("Set contains time: " + numb.format(delta)); |
| |
| start = System.currentTimeMillis(); |
| for (int i = 0; i < ITERATIONS; ++i) { |
| for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
| if (us.contains(cp) != bs.get(cp)) { |
| if (SHOW_ERRORS) System.out.println("Error at: " + info(cp)); |
| } |
| } |
| } |
| delta = (System.currentTimeMillis() - start)/ITERATIONS; |
| System.out.println("UnicodeSet contains time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta)); |
| |
| setupBinary(us); |
| start = System.currentTimeMillis(); |
| for (int i = 0; i < ITERATIONS; ++i) { |
| for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
| if (binaryContains(cp) != bs.get(cp)) { |
| if (SHOW_ERRORS) System.out.println("Error at: " + info(cp)); |
| } |
| } |
| } |
| delta = (System.currentTimeMillis() - start)/ITERATIONS; |
| System.out.println("BINARY UnicodeSet contains time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta)); |
| |
| System.out.println("Testing Iteration speed"); |
| |
| start = System.currentTimeMillis(); |
| for (int i = 0; i < ITERATIONS; ++i) { |
| it = s.iterator(); |
| while (it.hasNext()) { |
| temp += ((Integer)it.next()).intValue(); |
| } |
| } |
| oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS; |
| System.out.println("Set iteration time: " + numb.format(delta)); |
| |
| uit = new UnicodeSetIterator(us); |
| start = System.currentTimeMillis(); |
| for (int i = 0; i < ITERATIONS; ++i) { |
| uit.reset(); |
| while (uit.next()) { |
| temp += uit.codepoint; |
| } |
| } |
| delta = (System.currentTimeMillis() - start)/ITERATIONS; |
| System.out.println("UnicodeSet iteration time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta)); |
| |
| uit.reset(); |
| start = System.currentTimeMillis(); |
| while (uit.nextRange()) { |
| System.out.println(info(uit.codepoint, uit.codepointEnd)); |
| } |
| } |
| |
| static FastIntBinarySearch fibs; |
| |
| static void setupBinary(UnicodeSet us) { |
| int[] dummySearch = new int[us.getRangeCount()*2]; |
| int dummyLimit = 0; |
| UnicodeSetIterator uit = new UnicodeSetIterator(us); |
| while (uit.nextRange()) { |
| dummySearch[dummyLimit++] = uit.codepoint; |
| dummySearch[dummyLimit++] = uit.codepointEnd+1; |
| } |
| fibs = new FastIntBinarySearch(dummySearch); |
| } |
| |
| static boolean binaryContains(int cp) { |
| return ((fibs.findIndex(cp) & 1) != 0); // return true if odd |
| } |
| |
| |
| static String info(int cp) { |
| return Integer.toString(cp, 16).toUpperCase() + " " + UCharacter.getName(cp); |
| } |
| |
| static String info(int cpStart, int cpEnd) { |
| if (cpStart == cpEnd) { |
| return Integer.toString(cpStart, 16).toUpperCase() |
| + " " + UCharacter.getName(cpStart); |
| } |
| return Integer.toString(cpStart, 16).toUpperCase() + ".." + Integer.toString(cpEnd, 16).toUpperCase() |
| + " " + UCharacter.getName(cpStart) + ".." + UCharacter.getName(cpEnd); |
| } |
| |
| static int first; |
| static int limit = -2; |
| |
| static void optimizedAdd(UnicodeSet us, int cp) { |
| if (!OPTIMIZATION) { |
| us.add(cp); |
| return; |
| } |
| if (cp == limit) { |
| ++limit; |
| } else { |
| if (limit > 0) { |
| us.add(first, limit - 1); |
| // System.out.println(info(first, limit-1)); |
| } |
| first = cp; |
| limit = cp + 1; |
| } |
| } |
| |
| static void optimizedDone(UnicodeSet us) { |
| if (!OPTIMIZATION) return; |
| if (limit > 0) { |
| us.add(first, limit - 1); |
| //System.out.println(info(first, limit-1)); |
| } |
| limit = -2; // reset to invalid |
| } |
| |
| |
| public static class UXCharacter { |
| /** |
| * Provides interface for properties in |
| * http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt |
| * and their values in |
| * http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt |
| */ |
| |
| /** |
| * Tests a particular code point to see if the cited property has the given value. |
| * |
| * Sample: the following are equivalent |
| * <pre> |
| * if (UCharacter.test("LB", "AL", cp)) ... |
| * if (UCharacter.test("line break", "alphabetic", cp)) ... |
| * </pre> |
| * |
| */ |
| public static boolean test(String propertyName, String propertyValue, int codePoint) { |
| return false; |
| } |
| |
| /** |
| * Produces a UnicodeSet of code points that have the given propertyvalue for the given property. |
| * @param set the resulting value. The set is cleared, |
| * then all the code points with the given <property, value> are added. |
| * |
| * Sample: the following are equivalent |
| * <pre> |
| * if (UCharacter.test("WSpace", cp)) ... |
| * if (UCharacter.test("White_Space", cp)) ... |
| * if (UCharacter.test("White_Space", "true", cp)) ... |
| * if (!UCharacter.test("White_Space", "false", cp)) ... |
| * </pre> |
| * |
| */ |
| public static void getSet(String propertyName, String propertyValue, UnicodeSet set) { |
| // logical implemenation. Real implementation would be way faster! |
| set.clear(); |
| for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
| if (test(propertyName, propertyValue, cp)) set.add(cp); |
| } |
| } |
| |
| // ====================================================== |
| // POSSIBLE ADDITIONAL UTILITIES FOR CONVENIENCE OR SPEED |
| // ====================================================== |
| |
| /** |
| * Tests a particular code point to see if the cited boolean property is true. |
| * @param propertyName the cited property |
| * @param codePoint the particular code point |
| * @return true if the cited property has the given value for the specified code point. |
| * |
| * Sample: the following are equivalent |
| * <pre> |
| * if (UCharacter.test("WSpace", cp)) ... |
| * if (UCharacter.test("White_Space", cp)) ... |
| * if (UCharacter.test("White_Space", "true", cp)) ... |
| * if (!UCharacter.test("White_Space", "false", cp)) ... |
| * </pre> |
| * |
| */ |
| public static boolean test(String booleanPropertyName, int codePoint) { |
| return test(booleanPropertyName, "true", codePoint); |
| } |
| |
| // =============================================== |
| // The following allow access to properties by number, saving a string lookup |
| // on each call. |
| // =============================================== |
| |
| |
| /** |
| * Gets an index for higher-speed access to properties. |
| * |
| * Sample: |
| * <pre> |
| * int prop = UCharacter.getPropertyIndexIndex("LB"); |
| * int value = UCharacter.getValueIndex("LB", "AL"); |
| * while (true) { |
| * ... |
| * if (test(prop, value, codePoint)) ... |
| * </pre> |
| * |
| */ |
| public static int getPropertyIndex(String propertyName) { |
| return 0; |
| } |
| |
| /** |
| * Gets maximum property index, used for iterating through properties |
| * |
| */ |
| public static int getMaxPropertyIndex() { |
| return 0; |
| } |
| |
| |
| static final byte // NAME_STYLE |
| SHORT = 0, |
| DEFAULT = 1, |
| LONG = 2; |
| |
| /** |
| * Gets property name |
| * |
| */ |
| public static String getPropertyName(int propertyIndex, byte namestyle) { |
| return ""; |
| } |
| |
| /* |
| * Tests a particular code point to see if the cited property has the given value. |
| */ |
| public static boolean test(int propertyIndex, String propertyValue, int codePoint) { |
| return false; |
| } |
| |
| /** |
| * Produces a UnicodeSet of code points that have the given propertyvalue for the given property. |
| */ |
| public static void getSet(int propertyIndex, String propertyValue, UnicodeSet set) { |
| } |
| |
| // =============================================== |
| // The following allow access to enumerated property values by number, |
| // saving a string lookup on each call. |
| // They are only valid for enumerated properties |
| // including the combining character class (0..255). |
| // =============================================== |
| |
| /** |
| * Gets an index for higher-speed access to property values. |
| * Only valid for enumerated properties. |
| */ |
| public static int getValueIndex(String propertyName, String propertyValue) { |
| return 0; |
| } |
| |
| /** |
| * Gets maximum value index for a given property, used for iterating through property values. |
| * Only valid for enumerated properties. |
| * |
| */ |
| public static int getMaxValueIndex(int propertyIndex) { |
| return 0; |
| } |
| |
| /** |
| * Gets property value, corresponding to one of the values passed in |
| * |
| */ |
| public static String getValueName(int propertyIndex, int valueIndex, byte namestyle) { |
| return ""; |
| } |
| |
| /* |
| * Tests a particular code point to see if the cited property has the given value. |
| */ |
| public static boolean test(int propertyIndex, int valueIndex, int codePoint) { |
| return false; |
| } |
| |
| /** |
| * Produces a UnicodeSet of code points that have the given propertyvalue for the given property. |
| */ |
| public static void getSet(int propertyIndex, int valueIndex, UnicodeSet set) { |
| } |
| |
| |
| /* OPEN ISSUES: |
| - Don't like the names of the functions. Any better options? test => hasValue? hasPropertyValue? |
| - Should getSet really ADD to the set (avoiding the clear?) and be called addProperties? |
| Maybe faster sometimes, but might also be more errorprone. |
| */ |
| |
| } |
| } |