| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $ |
| * $Date: 2006/09/24 23:32:44 $ |
| * $Revision: 1.18 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.text.UCD; |
| |
| import java.util.*; |
| |
| import com.ibm.icu.dev.test.util.UnicodeMap; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| import com.ibm.text.utility.*; |
| import com.sun.java_cup.internal.internal_error; |
| |
| |
| /** |
| * Implements Unicode Normalization Forms C, D, KC, KD.<br> |
| * See UTR#15 for details.<br> |
| * Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br> |
| * The Unicode Consortium makes no expressed or implied warranty of any |
| * kind, and assumes no liability for errors or omissions. |
| * No liability is assumed for incidental and consequential damages |
| * in connection with or arising out of the use of the information here. |
| * @author Mark Davis |
| */ |
| |
| public final class Normalizer implements UCD_Types { |
    /** Copyright notice exposed as a public constant (legacy convention in this codebase). */
    public static final String copyright =
      "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";

    /** When true, internalCompose() prints per-character tracing to System.out. */
    public static boolean SHOW_PROGRESS = false;
| |
| /** |
| * Create a normalizer for a given form. |
| */ |
| public Normalizer(byte form, String unicodeVersion) { |
| this.form = form; |
| this.composition = (form & NF_COMPOSITION_MASK) != 0; |
| this.compatibility = (form & NF_COMPATIBILITY_MASK) != 0; |
| this.data = getData(unicodeVersion); |
| } |
| |
| /** |
| * Create a normalizer for a given form. |
| */ |
| // public Normalizer(byte form) { |
| // this(form,""); |
| //} |
| |
| /** |
| * Return string name |
| */ |
| public static String getName(byte form) { |
| return UCD_Names.NF_NAME[form]; |
| } |
| |
| /** |
| * Return string name |
| */ |
| public String getName() { |
| return getName(form); |
| } |
| |
    /**
     * Returns the version string of the Unicode Character Database backing
     * this normalizer (delegates to the cached Stub).
     */
    public String getUCDVersion() {
        return data.getUCDVersion();
    }
| |
    /**
     * Returns true if this form composes (the NF_COMPOSITION_MASK bit was set,
     * i.e. NFC or NFKC).
     */
    public boolean isComposition() {
        return composition;
    }
| |
    /**
     * Returns true if this form uses compatibility decompositions (the
     * NF_COMPATIBILITY_MASK bit was set, i.e. NFKC or NFKD).
     */
    public boolean isCompatibility() {
        return compatibility;
    }
| |
| /** |
| * Normalizes text according to the chosen form, |
| * replacing contents of the target buffer. |
| * @param source the original text, unnormalized |
| * @param target the resulting normalized text |
| */ |
| public StringBuffer normalize(String source, StringBuffer target) { |
| |
| // First decompose the source into target, |
| // then compose if the form requires. |
| |
| if (source.length() != 0) { |
| internalDecompose(source, target, true, compatibility); |
| if (composition) { |
| internalCompose(target); |
| } |
| } |
| return target; |
| } |
| |
    /**
     * Tests whether the source string is FCD ("fast C or D"): its canonical
     * decomposition is already in canonical order.  Implemented by canonically
     * decomposing twice — once without and once with reordering — and
     * comparing the two results.
     *
     * @param source the text to test
     * @return true if the string is FCD (the empty string trivially is)
     */
    public boolean isFCD(String source) {
        if (source.length() == 0) return true;
        StringBuffer noReorder = new StringBuffer();
        StringBuffer reorder = new StringBuffer();

        // Same canonical (compat=false) decomposition; only reordering differs.
        internalDecompose(source, noReorder, false, false);
        internalDecompose(source, reorder, true, false);

        return reorder.toString().equals(noReorder.toString());
    }
| |
| /** |
| * Normalizes text according to the chosen form |
| * @param source the original text, unnormalized |
| * @return target the resulting normalized text |
| */ |
| public String normalize(String source) { |
| return normalize(source, new StringBuffer()).toString(); |
| } |
| |
    /**
     * Normalizes a single code point according to the chosen form.
     *
     * @param cp the code point to normalize
     * @return the resulting normalized text
     */
    public String normalize(int cp) {
        return normalize(UTF16.valueOf(cp));
    }
| |
| /** |
| private StringBuffer hasDecompositionBuffer = new StringBuffer(); |
| |
| public boolean hasDecomposition(int cp) { |
| hasDecompositionBuffer.setLength(0); |
| normalize(UTF16.valueOf(cp), hasDecompositionBuffer); |
| if (hasDecompositionBuffer.length() != 1) return true; |
| return cp != hasDecompositionBuffer.charAt(0); |
| } |
| */ |
| |
| /** |
| * Does a quick check to see if the string is in the current form. Checks canonical order and |
| * isAllowed(). |
| * @param newLocaleID source text |
| * @return YES, NO, MAYBE |
| */ |
| /* |
| public static final int NO = 0, YES = 1, MAYBE = -1; |
| |
| public int quickCheck(String source) { |
| short lastCanonicalClass = 0; |
| int result = YES; |
| for (int i = 0; i < source.length(); ++i) { |
| char ch = source.charAt(i); |
| short canonicalClass = data.getCanonicalClass(ch); |
| if (lastCanonicalClass > canonicalClass && canonicalClass != 0) { |
| return NO; |
| } |
| int check = isAllowed(ch); |
| if (check == NO) return NO; |
| if (check == MAYBE) result = MAYBE; |
| } |
| return result; |
| } |
| |
| /** |
| * Find whether the given character is allowed in the current form. |
| * @return YES, NO, MAYBE |
| */ |
| /* |
| public int isAllowed(char ch) { |
| if (composition) { |
| if (compatibility) { |
| if (data.isCompatibilityExcluded(ch)) { |
| return NO; |
| } |
| } else { |
| if (data.isExcluded(ch)) { |
| return NO; |
| } |
| } |
| if (data.isTrailing(ch)) { |
| return MAYBE; |
| } |
| } else { // decomposition: both NFD and NFKD |
| if (data.normalizationDiffers(compatibility,ch)) return NO; |
| } |
| return YES; |
| } |
| |
| /** |
| * Utility: Gets the combining class of a character from the |
| * Unicode Character Database. Only a byte is needed, but since they are signed in Java |
| * return an int to forstall problems. |
| * @param ch the source character |
| * @return value from 0 to 255 |
| */ |
| |
    /**
     * Utility: gets the canonical (combining) class of a code point from the
     * Unicode Character Database.  Only a byte is needed, but bytes are signed
     * in Java, so a short is returned to forestall problems.
     * (The original javadoc for this method sits inside the dead comment block
     * above and never reaches the compiler.)
     *
     * @param ch the source code point
     * @return value from 0 to 255
     */
    public short getCanonicalClass(int ch) {
        return data.getCanonicalClass(ch);
    }
| |
    /**
     * Tests whether a single code point is already normalized under this
     * normalizer's form, i.e. whether its normalization differs from itself
     * (compatibility or canonical according to the particular normalizer).
     *
     * @param ch the source code point
     */
    public boolean isNormalized(int ch) {
        return !data.normalizationDiffers(ch, composition, compatibility);
    }
| |
| /** |
| * Utility: Checks whether there is a recursive decomposition of a character from the |
| * Unicode Character Database. It is compatibility or canonical according to the particular |
| * normalizer. |
| * @param ch the source character |
| */ |
| public boolean isNormalized(String s) { |
| if (UTF16.countCodePoint(s) > 1) { |
| return !data.normalizationDiffers(UTF16.charAt(s,0), composition, compatibility); |
| } |
| return s.equals(normalize(s)); // TODO: OPTIMIZE LATER |
| } |
| |
    /**
     * Utility: appends the recursive decomposition of a character from the
     * Unicode Character Database to the buffer.  It is compatibility or
     * canonical according to the particular normalizer.
     *
     * @param ch the source character
     * @param buffer buffer to be filled with the decomposition
     */
    public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
        data.getRecursiveDecomposition(ch, buffer, compatibility);
    }
| |
    /**
     * Utility: records the composition status of every character known to the
     * pairwise composition table, plus the Hangul Jamo (which compose
     * algorithmically and so are not in the table).  Each set parameter may be
     * null if the caller does not need it.
     * (The stale javadoc above described a removed IntEnumeration-based API.)
     *
     * @param leading receives characters that can begin a composition
     * @param trailing receives characters that can end a composition
     * @param resulting receives characters that are the result of a composition
     */
    public void getCompositionStatus(BitSet leading, BitSet trailing, BitSet resulting) {
        // Each table key packs a pair as (first << 32) | second; the value is
        // the composite code point.
        Iterator it = data.compTable.keySet().iterator();
        while (it.hasNext()) {
            Long key = (Long)it.next();
            Integer result = (Integer)data.compTable.get(key);
            long keyLong = key.longValue();
            if (leading != null) leading.set((int)(keyLong >>> 32));
            if (trailing != null) trailing.set((int)keyLong);
            if (resulting != null) resulting.set(result.intValue());
        }
        for (int i = UCD.LBase; i < UCD.TLimit; ++i) {
            if (leading != null && UCD.isLeadingJamo(i)) leading.set(i); // set all initial Jamo (that form syllables)
            if (trailing != null && UCD.isNonLeadJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
        }
        if (leading != null) {
            // An LV syllable can itself lead: LV + T -> LVT.
            for (int i = UCD.SBase; i < UCD.SLimit; ++i) {
                if (UCD.isDoubleHangul(i)) leading.set(i); // set all two-Jamo syllables
            }
        }
    }
| |
| public boolean isTrailing(int cp) { |
| return this.composition ? data.isTrailing(cp) : false; |
| } |
| |
| public boolean isLeading(int cp) { |
| return this.composition ? data.isLeading(cp) : false; |
| } |
| |
    /**
     * Returns the pairwise composition of the two code points, or
     * Stub.NOT_COMPOSITE (0xFFFF) if they do not compose.
     */
    public int getComposition(int first, int second) {
        return data.getPairwiseComposition(first, second);
    }
| |
    // ======================================
    //                  PRIVATES
    // ======================================

    /**
     * The current form.
     */
    private byte form;
    // Derived from form in the constructor: true for NFC/NFKC.
    private boolean composition;
    // Derived from form in the constructor: true for NFKC/NFKD.
    private boolean compatibility;
    // Optional code-point -> replacement-string overrides consulted before the
    // UCD decomposition in internalDecompose; null means no substitution.
    private UnicodeMap substituteMapping;
| |
    /**
     * Decomposes text, either canonical or compatibility, appending the
     * result to the target buffer (with canonical reordering if requested).
     * (The old javadoc documented a nonexistent {@code form} parameter.)
     *
     * @param source the original text, unnormalized
     * @param target receives the decomposed text
     * @param reorder if true, combining marks are inserted in canonical order
     * @param compat if true, selects the recursive compatibility
     *        decomposition, otherwise the recursive canonical decomposition
     */
    private void internalDecompose(String source, StringBuffer target, boolean reorder, boolean compat) {
        StringBuffer buffer = new StringBuffer();
        int ch32;
        for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
            buffer.setLength(0);
            ch32 = UTF16.charAt(source, i);
            // A substitute mapping, if installed, overrides the UCD decomposition.
            String sub = substituteMapping == null ? null : (String) substituteMapping.getValue(ch32);
            if (sub != null) {
                buffer.append(sub);
            } else {
                data.getRecursiveDecomposition(ch32, buffer, compat);
            }

            // add all of the characters in the decomposition.
            // (may be just the original character, if there was
            // no decomposition mapping)

            int ch;
            for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
                ch = UTF16.charAt(buffer, j);
                int chClass = data.getCanonicalClass(ch);
                int k = target.length(); // insertion point
                if (chClass != 0 && reorder) {

                    // bubble-sort combining marks as necessary: back up past any
                    // trailing characters with a strictly greater combining class

                    int ch2;
                    for (; k > 0; k -= UTF16.getCharCount(ch2)) {
                        ch2 = UTF16.charAt(target, k-1);
                        if (data.getCanonicalClass(ch2) <= chClass) break;
                    }
                }
                target.insert(k, UTF16.valueOf(ch));
            }
        }
    }
| |
    /**
     * Composes text in place.  Target must already
     * have been decomposed (and canonically reordered).
     * Uses UTF16, which is a utility class for supplementary character support in Java.
     *
     * @param target input: decomposed text.
     *        output: the resulting normalized text.
     */
    private void internalCompose(StringBuffer target) {
        int starterPos = 0;                            // index of the current starter
        int starterCh = UTF16.charAt(target,0);
        int compPos = UTF16.getCharCount(starterCh);   // length of the composed prefix so far
        int lastClass = data.getCanonicalClass(starterCh);
        if (lastClass != 0) lastClass = 256; // fix for strings starting with a combining mark
        int oldLen = target.length();

        // Loop on the decomposed characters, combining where possible

        int ch;
        for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
            ch = UTF16.charAt(target, decompPos);
            if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
                + ", decompPos: " + decompPos
                + ", compPos: " + compPos
                + ", ch: " + Utility.hex(ch)
                );
            int chClass = data.getCanonicalClass(ch);
            int composite = data.getPairwiseComposition(starterCh, ch);
            // Combine when a composite exists and ch is not blocked from the
            // starter (i.e. no intervening mark of class >= chClass).
            if (composite != data.NOT_COMPOSITE
                && (lastClass < chClass || lastClass == 0)) {
                UTF16.setCharAt(target, starterPos, composite);
                // we know that we will only be replacing non-supplementaries by non-supplementaries
                // so we don't have to adjust the decompPos
                starterCh = composite;
            } else {
                if (chClass == 0) {
                    // ch is a new starter; composition restarts here.
                    starterPos = compPos;
                    starterCh = ch;
                }
                lastClass = chClass;
                UTF16.setCharAt(target, compPos, ch);
                // setCharAt may change the buffer length when a BMP char is
                // swapped for a supplementary (or vice versa), so resync.
                // NOTE(review): the println looks like leftover debugging —
                // confirm whether it should stay.
                if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
                    System.out.println("ADJUSTING: " + Utility.hex(target));
                    decompPos += target.length() - oldLen;
                    oldLen = target.length();
                }
                compPos += UTF16.getCharCount(ch);
            }
        }
        target.setLength(compPos);
    }
| |
    /**
     * Per-Unicode-version normalization data derived from the UCD: the
     * pairwise composition table, leading/trailing sets, and the recompose
     * bits.  One Stub is built per version string and cached by getData().
     */
    static class Stub {
        private UCD ucd;
        // (first << 32 | second) -> composite code point, as Long -> Integer.
        private HashMap compTable = new HashMap();
        // Characters that can occur second in a composition (incl. trailing Jamo).
        private BitSet isSecond = new BitSet();
        // Characters that can occur first in a composition (incl. leading Jamo).
        private BitSet isFirst = new BitSet();
        // Characters whose canonical decomposition recomposes back to them.
        private BitSet canonicalRecompose = new BitSet();
        // Same, restricted to pairs whose parts don't compatibility-decompose.
        private BitSet compatibilityRecompose = new BitSet();
        static final int NOT_COMPOSITE = 0xFFFF;

        Stub(String version) {
            ucd = UCD.make(version);
            // NOTE(review): the bound is i < 0x10FFFF, so U+10FFFF itself is
            // skipped — harmless (it is a noncharacter, never assigned) but
            // inconsistent with makeSpacingMap's inclusive bound; confirm.
            for (int i = 0; i < 0x10FFFF; ++i) {
                if (!ucd.isAssigned(i)) continue;
                if (ucd.isPUA(i)) continue;
                if (ucd.isNonLeadJamo(i)) isSecond.set(i);
                if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
                byte dt = ucd.getDecompositionType(i);
                // Only canonical, non-excluded pair decompositions go in the table.
                if (dt != CANONICAL) continue;
                if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
                    try {
                        String s = ucd.getDecompositionMapping(i);
                        int len = UTF16.countCodePoint(s);
                        if (len != 2) {
                            // Pre-3.0.0 data may legitimately have longer
                            // canonical mappings; from 3.0.0 on it is an error.
                            if (len > 2) {
                                if (ucd.getVersion().compareTo("3.0.0") >= 0) {
                                    throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
                                }
                            }
                            continue;
                        }
                        int a = UTF16.charAt(s, 0);
                        // First element must be a starter (combining class 0).
                        if (ucd.getCombiningClass(a) != 0) continue;
                        isFirst.set(a);

                        int b = UTF16.charAt(s, UTF16.getCharCount(a));
                        isSecond.set(b);

                        // have a recomposition, so set the bit
                        canonicalRecompose.set(i);

                        // set the compatibility recomposition bit
                        // ONLY if the component characters
                        // don't compatibility decompose
                        if (ucd.getDecompositionType(a) <= CANONICAL
                            && ucd.getDecompositionType(b) <= CANONICAL) {
                            compatibilityRecompose.set(i);
                        }

                        long key = (((long)a)<<32) | b;

                        /*if (i == '\u1E0A' || key == 0x004400000307) {
                            System.out.println(Utility.hex(s));
                            System.out.println(Utility.hex(i));
                            System.out.println(Utility.hex(key));
                        }*/
                        compTable.put(new Long(key), new Integer(i));
                    } catch (Exception e) {
                        throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
                    }
                }
            }
            // process compatibilityRecompose
            // have to do this afterwards, since we don't know whether the pieces
            // are allowable until we have processed all the characters
            /*
            Iterator it = compTable.keySet().iterator();
            while (it.hasNext()) {
                Long key = (Long)it.next();
                int cp = compTable.get(key);
                long keyLong = key.longValue();
                int first = (int)(keyLong >>> 32);
                int second = (int)keyLong;
                if (ucd.
            */
        }

        /** Returns the UCD version string this stub was built from. */
        String getUCDVersion() {
            return ucd.getVersion();
        }

        /*
        Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
        Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
        Problem: differs: true, call: false U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
        Problem: differs: true, call: false U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
        Problem: differs: true, call: false U+1FC1 GREEK DIALYTIKA AND PERISPOMENI
        Problem: differs: true, call: false U+1FCD GREEK PSILI AND VARIA
        Problem: differs: true, call: false U+1FCE GREEK PSILI AND OXIA
        Problem: differs: true, call: false U+1FCF GREEK PSILI AND PERISPOMENI
        Problem: differs: true, call: false U+1FDD GREEK DASIA AND VARIA
        Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
        Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
        Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
        */

        /** Combining class of cp, 0..255. */
        short getCanonicalClass(int cp) {
            return ucd.getCombiningClass(cp);
        }

        /** True if cp can occur second in a composition. */
        boolean isTrailing(int cp) {
            return isSecond.get(cp);
        }

        /** True if cp can occur first in a composition. */
        boolean isLeading(int cp) {
            return isFirst.get(cp);
        }

        /**
         * True if normalizing cp under the given form yields something other
         * than cp itself.  For composing forms, characters that decompose but
         * recompose back to themselves do NOT differ.
         */
        boolean normalizationDiffers(int cp, boolean composition, boolean compat) {
            byte dt = ucd.getDecompositionType(cp);
            if (!composition) {
                if (compat) return dt >= CANONICAL;
                else return dt == CANONICAL;
            } else {
                // almost the same, except that we add back in the characters
                // that RECOMPOSE
                if (compat) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
                else return dt == CANONICAL && !canonicalRecompose.get(cp);
            }
        }

        /**
         * Appends the recursive (canonical, or also compatibility when compat
         * is true) decomposition of cp to buffer; cp itself if it has none.
         */
        public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compat) {
            byte dt = ucd.getDecompositionType(cp);
            // we know we decompose all CANONICAL, plus > CANONICAL if compat is TRUE.
            if (dt == CANONICAL || dt > CANONICAL && compat) {
                String s = ucd.getDecompositionMapping(cp);
                // NOTE(review): a mapping equal to the character itself would
                // recurse forever; this print flags bad UCD data — confirm.
                if (s.equals(UTF16.valueOf(cp))) {
                    System.out.println("fix");
                }
                for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
                    cp = UTF16.charAt(s, i);
                    getRecursiveDecomposition(cp, buffer, compat);
                }
            } else {
                UTF16.append(buffer, cp);
            }
        }

        /**
         * Pairwise composition of the two code points: algorithmic Hangul
         * composition first, then the table.  Returns NOT_COMPOSITE (0xFFFF,
         * written literally here) when the pair does not compose.
         */
        int getPairwiseComposition(int starterCh, int ch) {
            int hangulPoss = UCD.composeHangul(starterCh, ch);
            if (hangulPoss != 0xFFFF) return hangulPoss;
            Object obj = compTable.get(new Long((((long)starterCh)<<32) | ch));
            if (obj == null) return 0xFFFF;
            return ((Integer)obj).intValue();
        }

    }
| |
    /**
     * Contains normalization data from the Unicode Character Database
     * for this normalizer's Unicode version.
     */
    private Stub data;

    // Version string -> Stub, so each version's data is built at most once.
    private static HashMap versionCache = new HashMap();
| |
| private static Stub getData (String version) { |
| if (version.length() == 0) version = UCD.latestVersion; |
| Stub result = (Stub)versionCache.get(version); |
| if (result == null) { |
| result = new Stub(version); |
| versionCache.put(version, result); |
| } |
| return result; |
| } |
| |
    /** Returns the substitute mapping consulted before decomposition, or null. */
    public UnicodeMap getSubstituteMapping() {
        return substituteMapping;
    }
| |
    /**
     * Installs a code-point -> replacement-string mapping that overrides the
     * UCD decomposition in internalDecompose.
     * @return this, for chaining
     */
    public Normalizer setSubstituteMapping(UnicodeMap substituteMapping) {
        this.substituteMapping = substituteMapping;
        return this;
    }
| |
| static UnicodeMap spacingMap;; |
| public void setSpacingSubstitute() { |
| if (spacingMap == null) { |
| makeSpacingMap(); |
| } |
| setSubstituteMapping(spacingMap); |
| } |
| |
    /**
     * Builds the shared spacingMap: covers characters whose compatibility
     * decomposition is a space (U+0020) or tatweel (U+0640) followed only by
     * non-spacing (Mn) or enclosing (Me) marks, plus a table of hand-picked
     * spacing replacements for specific characters.
     */
    private void makeSpacingMap() {
        spacingMap = new UnicodeMap();
        StringBuffer b = new StringBuffer();
        main:
        for (int i = 0; i <= 0x10FFFF; ++i) {
            boolean compat = data.ucd.getDecompositionType(i) >= data.ucd.CANONICAL;
            if (!compat) continue;
            b.setLength(0);
            data.getRecursiveDecomposition(i, b, true);
            if (b.length() == 1) continue;
            char firstChar = b.charAt(0);
            if (firstChar != 0x20 && firstChar != '\u0640') continue;
            // if rest are just Mn or Me marks, then add to substitute mapping
            int cp;
            for (int j = 1; j < b.length(); j += UTF16.getCharCount(cp)) {
                cp = UTF16.charAt(b,j);
                int cat = data.ucd.getCategory(cp);
                if (cat != data.ucd.Mn && cat != data.ucd.Me) continue main;
            }
            // NOTE(review): maps i to ITSELF (an identity substitute, which
            // suppresses the space+marks decomposition) rather than to the
            // decomposition in b — presumably intentional, since the specials
            // below then overwrite selected entries; confirm.
            spacingMap.put(i, UTF16.valueOf(i));
        }
        // Hand-picked spacing replacements: {UnicodeSet pattern, replacement}.
        String[][] specials = {
            {"[\\u0384\\u1FFD]", "\u00B4"},
            {"[\\uFFE3]", "\u00AF"},
            {"[\\uFE49-\\uFE4C]", "\u203E"},
            {"[\\u1FED]", "\u00A8\u0300"},
            {"[\\u1FEE\\u0385]", "\u00A8\u0301"},
            {"[\\u1FC1]", "\u00A8\u0342"},
            {"[\\u1FBD]", "\u1FBF"},
            {"[\\u1FCD]", "\u1FBF\u0300"},
            {"[\\u1FCE]", "\u1FBF\u0301"},
            {"[\\u1FCF]", "\u1FBF\u0342"},
            {"[\\u1FDD]", "\u1FFE\u0300"},
            {"[\\u1FDE]", "\u1FFE\u0301"},
            {"[\\u1FDF]", "\u1FFE\u0342"},
            {"[\\uFC5E]", "\uFE72\u0651"},
            {"[\\uFC5F]", "\uFE74\u0651"},
            {"[\\uFC60]", "\uFE76\u0651"},
            {"[\\uFC61]", "\uFE78\u0651"},
            {"[\\uFC62]", "\uFE7A\u0651"},
            {"[\\uFC63]", "\uFE7C\u0670"},
            {"[\\uFCF2]", "\uFE77\u0651"},
            {"[\\uFCF3]", "\uFE79\u0651"},
            {"[\\uFCF4]", "\uFE7B\u0651"},
        };
        // count is accumulated but never read — apparently debugging leftover.
        int count = 0;
        UnicodeSet mappedChars = spacingMap.keySet();
        for (int i = 0; i < specials.length; ++i) {
            UnicodeSet source = new UnicodeSet(specials[i][0]);
            // Sanity check: every special must already be in the computed set.
            if (!mappedChars.containsAll(source)) {
                throw new InternalError("Remapping character that doesn't need it!" + source);
            }
            spacingMap.putAll(source, specials[i][1]);
            count += source.size();
        }
        spacingMap.freeze();
    }
| |
| /** |
| * Just accessible for testing. |
| */ |
| /* |
| boolean isExcluded (char ch) { |
| return data.isExcluded(ch); |
| } |
| |
| /** |
| * Just accessible for testing. |
| */ |
| /* |
| String getRawDecompositionMapping (char ch) { |
| return data.getRawDecompositionMapping(ch); |
| } |
| //*/ |
| } |