| package com.ibm.icu.dev.test.normalizer; |
| |
| import java.util.BitSet; |
| |
| import com.ibm.icu.dev.test.UTF16Util; |
| |
| /** |
| * Accesses the Normalization Data used for Forms C and D.<br> |
| * Copyright (C) 1998-2004 International Business Machines Corporation and |
| * Unicode, Inc. All Rights Reserved.<br> |
| * The Unicode Consortium makes no expressed or implied warranty of any |
| * kind, and assumes no liability for errors or omissions. |
| * No liability is assumed for incidental and consequential damages |
| * in connection with or arising out of the use of the information here. |
| * @author Mark Davis |
| * Updates for supplementary code points: |
| * Vladimir Weinstein & Markus Scherer |
| */ |
| public class NormalizerData { |
| static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc."; |
| |
| /** |
| * Constant for use in getPairwiseComposition |
| */ |
| public static final int NOT_COMPOSITE = '\uFFFF'; |
| |
| /** |
| * Gets the combining class of a character from the |
| * Unicode Character Database. |
| * @param ch the source character |
| * @return value from 0 to 255 |
| */ |
| public int getCanonicalClass(int ch) { |
| return canonicalClass.get(ch); |
| } |
| |
| /** |
| * Returns the composite of the two characters. If the two |
| * characters don't combine, returns NOT_COMPOSITE. |
| * @param first first character (e.g. 'c') |
| * @param second second character (e.g. \u0327 cedilla) |
| * @return composite (e.g. \u00C7 c cedilla) |
| */ |
| public int getPairwiseComposition(int first, int second) { |
| return compose.get(((long)first << 32) | second); |
| } |
| |
| |
| /** |
| * Gets recursive decomposition of a character from the |
| * Unicode Character Database. |
| * @param canonical If true |
| * bit is on in this byte, then selects the recursive |
| * canonical decomposition, otherwise selects |
| * the recursive compatibility and canonical decomposition. |
| * @param ch the source character |
| * @param buffer buffer to be filled with the decomposition |
| */ |
| public void getRecursiveDecomposition(boolean canonical, int ch, StringBuffer buffer) { |
| String decomp = decompose.get(ch); |
| if (decomp != null && !(canonical && isCompatibility.get(ch))) { |
| for (int i = 0; i < decomp.length(); i+=UTF16Util.codePointLength(ch)) { |
| ch = UTF16Util.nextCodePoint(decomp, i); |
| getRecursiveDecomposition(canonical, ch, buffer); |
| } |
| } else { // if no decomp, append |
| UTF16Util.appendCodePoint(buffer, ch); |
| } |
| } |
| |
| // ================================================= |
| // PRIVATES |
| // ================================================= |
| |
| /** |
| * Only accessed by NormalizerBuilder. |
| */ |
| NormalizerData(IntHashtable canonicalClass, IntStringHashtable decompose, |
| LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) { |
| this.canonicalClass = canonicalClass; |
| this.decompose = decompose; |
| this.compose = compose; |
| this.isCompatibility = isCompatibility; |
| this.isExcluded = isExcluded; |
| } |
| |
| /** |
| * Just accessible for testing. |
| */ |
| boolean getExcluded (char ch) { |
| return isExcluded.get(ch); |
| } |
| |
| /** |
| * Just accessible for testing. |
| */ |
| String getRawDecompositionMapping (char ch) { |
| return decompose.get(ch); |
| } |
| |
| /** |
| * For now, just use IntHashtable |
| * Two-stage tables would be used in an optimized implementation. |
| */ |
| private IntHashtable canonicalClass; |
| |
| /** |
| * The main data table maps chars to a 32-bit int. |
| * It holds either a pair: top = first, bottom = second |
| * or singleton: top = 0, bottom = single. |
| * If there is no decomposition, the value is 0. |
| * Two-stage tables would be used in an optimized implementation. |
| * An optimization could also map chars to a small index, then use that |
| * index in a small array of ints. |
| */ |
| private IntStringHashtable decompose; |
| |
| /** |
| * Maps from pairs of characters to single. |
| * If there is no decomposition, the value is NOT_COMPOSITE. |
| */ |
| private LongHashtable compose; |
| |
| /** |
| * Tells whether decomposition is canonical or not. |
| */ |
| private BitSet isCompatibility = new BitSet(); |
| |
| /** |
| * Tells whether character is script-excluded or not. |
| * Used only while building, and for testing. |
| */ |
| |
| private BitSet isExcluded = new BitSet(); |
| } |