| /** |
| ******************************************************************************* |
| * Copyright (C) 2002-2010, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.icu.dev.tool.layout; |
| |
| import java.util.Vector; |
| |
| import com.ibm.icu.impl.Utility; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.text.CanonicalIterator; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| public class CanonicalCharacterData |
| { |
| private static int THRESHOLD = 4; |
| |
| public class Record |
| { |
| // TODO: might want to save arrays of Char32's rather than UTF16 strings... |
| Record(int character, int script) |
| { |
| String char32 = UCharacter.toString(character); |
| CanonicalIterator iterator = new CanonicalIterator(char32); |
| Vector equivs = new Vector(); |
| |
| composed = character; |
| |
| for (String equiv = iterator.next(); equiv != null; equiv = iterator.next()) { |
| // Skip all equivalents of length 1; it's either the original |
| // characeter or something like Angstrom for A-Ring, which we don't care about |
| if (UTF16.countCodePoint(equiv) > 1) { |
| equivs.add(equiv); |
| } |
| } |
| |
| int nEquivalents = equivs.size(); |
| |
| if (nEquivalents > maxEquivalents[script]) { |
| maxEquivalents[script] = nEquivalents; |
| } |
| |
| if (nEquivalents > 0) { |
| equivalents = new String[nEquivalents]; |
| |
| if (nEquivalents > THRESHOLD) { |
| dumpEquivalents(character, equivs); |
| } |
| |
| sortEquivalents(equivalents, equivs); |
| } |
| } |
| |
| public int getComposedCharacter() |
| { |
| return composed; |
| } |
| |
| public int countEquivalents() |
| { |
| if (equivalents == null) { |
| return 0; |
| } |
| |
| return equivalents.length; |
| } |
| |
| public String[] getEquivalents() |
| { |
| return equivalents; |
| } |
| |
| public String getEquivalent(int index) |
| { |
| if (equivalents == null || index < 0 || index >= equivalents.length) { |
| return null; |
| } |
| |
| return equivalents[index]; |
| } |
| |
| private void dumpEquivalents(int character, Vector equivs) |
| { |
| int count = equivs.size(); |
| |
| System.out.println(Utility.hex(character, 6) + " - " + count + ":"); |
| |
| for (int i = 0; i < count; i += 1) { |
| String equiv = (String) equivs.elementAt(i); |
| int codePoints = UTF16.countCodePoint(equiv); |
| |
| for (int c = 0; c < codePoints; c += 1) { |
| if (c > 0) { |
| System.out.print(" "); |
| } |
| |
| System.out.print(Utility.hex(UTF16.charAt(equiv, c), 6)); |
| } |
| |
| System.out.println(); |
| } |
| |
| System.out.println(); |
| } |
| |
| private int composed; |
| private String[] equivalents = null; |
| } |
| |
| public CanonicalCharacterData() |
| { |
| // nothing to do... |
| } |
| |
| public void add(int character) |
| { |
| int script = UScript.getScript(character); |
| Vector recordVector = recordVectors[script]; |
| |
| if (recordVector == null) { |
| recordVector = recordVectors[script] = new Vector(); |
| } |
| |
| recordVector.add(new Record(character, script)); |
| } |
| |
| public int getMaxEquivalents(int script) |
| { |
| if (script < 0 || script >= UScript.CODE_LIMIT) { |
| return 0; |
| } |
| |
| return maxEquivalents[script]; |
| } |
| |
| public Record getRecord(int script, int index) |
| { |
| if (script < 0 || script >= UScript.CODE_LIMIT) { |
| return null; |
| } |
| |
| Vector recordVector = recordVectors[script]; |
| |
| if (recordVector == null || index < 0 || index >= recordVector.size()) { |
| return null; |
| } |
| |
| return (Record) recordVector.elementAt(index); |
| } |
| |
| public int countRecords(int script) |
| { |
| if (script < 0 || script >= UScript.CODE_LIMIT || |
| recordVectors[script] == null) { |
| return 0; |
| } |
| |
| return recordVectors[script].size(); |
| } |
| |
| public static CanonicalCharacterData factory(UnicodeSet characterSet) |
| { |
| int charCount = characterSet.size(); |
| CanonicalCharacterData data = new CanonicalCharacterData(); |
| |
| System.out.println("There are " + charCount + " characters with a canonical decomposition."); |
| |
| for (int i = 0; i < charCount; i += 1) { |
| data.add(characterSet.charAt(i)); |
| } |
| |
| return data; |
| } |
| |
| private static int compareEquivalents(String a, String b) |
| { |
| int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b); |
| |
| if (result == 0) { |
| return a.compareTo(b); |
| } |
| |
| return result; |
| } |
| |
| // |
| // Straight insertion sort from Knuth vol. III, pg. 81 |
| // |
| private static void sortEquivalents(String[] equivalents, Vector unsorted) |
| { |
| int nEquivalents = equivalents.length; |
| |
| for (int e = 0; e < nEquivalents; e += 1) { |
| String v = (String) unsorted.elementAt(e); |
| int i; |
| |
| for (i = e - 1; i >= 0; i -= 1) { |
| if (compareEquivalents(v, equivalents[i]) >= 0) { |
| break; |
| } |
| |
| equivalents[i + 1] = equivalents[i]; |
| } |
| |
| equivalents[i + 1] = v; |
| } |
| } |
| |
| private Vector recordVectors[] = new Vector[UScript.CODE_LIMIT]; |
| private int maxEquivalents[] = new int[UScript.CODE_LIMIT]; |
| |
| } |