// blob: d18288bd5e6e54379f86716629adac889ff15b2e [file] [log] [blame]
package com.ibm.icu.dev.test.normalizer;
import java.util.BitSet;
import com.ibm.icu.dev.test.UTF16Util;
/**
* Accesses the Normalization Data used for Forms C and D.<br>
* Copyright (C) 1998-2004 International Business Machines Corporation and
* Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
* @author Mark Davis
* Updates for supplementary code points:
* Vladimir Weinstein & Markus Scherer
*/
public class NormalizerData {
    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";

    /**
     * Sentinel (0xFFFF) that callers compare against the result of
     * {@link #getPairwiseComposition(int, int)} to detect "no composite".
     */
    public static final int NOT_COMPOSITE = '\uFFFF';

    /**
     * Gets the combining class of a character by looking it up in the
     * canonical-class table supplied at construction time.
     *
     * @param ch the source code point
     * @return the table entry for {@code ch}; per the Unicode Character
     *         Database this is expected to be a value from 0 to 255
     */
    public int getCanonicalClass(int ch) {
        return canonicalClass.get(ch);
    }

    /**
     * Returns the composite of the two characters by looking up the pair in
     * the composition table.
     * If the two characters don't combine, the result is expected to be
     * {@link #NOT_COMPOSITE}; that relies on how the builder configured the
     * table's value for missing keys, which is not visible in this class.
     *
     * @param first  first code point (e.g. 'c')
     * @param second second code point (e.g. U+0327 cedilla)
     * @return composite (e.g. U+00E7 c cedilla), or the table's
     *         missing-key value
     */
    public int getPairwiseComposition(int first, int second) {
        // Key layout: first in the high 32 bits, second in the low 32 bits.
        // Mask second so a negative value cannot sign-extend over the high
        // half; for valid (non-negative) code points the key is unchanged.
        long key = ((long) first << 32) | (second & 0xFFFFFFFFL);
        return compose.get(key);
    }

    /**
     * Appends the recursive decomposition of a code point to a buffer.
     * <p>
     * If {@code ch} has a decomposition mapping that applies in the requested
     * mode, each code point of that mapping is decomposed in turn; otherwise
     * {@code ch} itself is appended.
     *
     * @param canonical if true, only canonical decompositions are applied
     *                  (mappings flagged as compatibility are skipped and the
     *                  character is appended unchanged); if false, both
     *                  compatibility and canonical decompositions are applied
     * @param ch        the source code point
     * @param buffer    buffer that receives the decomposition
     */
    public void getRecursiveDecomposition(boolean canonical, int ch, StringBuffer buffer) {
        String mapping = decompose.get(ch);
        boolean applies = mapping != null && !(canonical && isCompatibility.get(ch));
        if (!applies) {
            // No usable decomposition: the character stands for itself.
            UTF16Util.appendCodePoint(buffer, ch);
            return;
        }
        int index = 0;
        while (index < mapping.length()) {
            int part = UTF16Util.nextCodePoint(mapping, index);
            getRecursiveDecomposition(canonical, part, buffer);
            // Advance by the number of UTF-16 units of the code point just read.
            index += UTF16Util.codePointLength(part);
        }
    }

    // =================================================
    // PRIVATES
    // =================================================

    /**
     * Only accessed by NormalizerBuilder.
     * The tables are stored by reference; no copies are made.
     *
     * @param canonicalClass  code point to combining class table
     * @param decompose       code point to decomposition mapping table
     * @param compose         (first, second) pair to composite table
     * @param isCompatibility bit set of code points whose mapping is a
     *                        compatibility (non-canonical) decomposition
     * @param isExcluded      bit set of excluded code points
     */
    NormalizerData(IntHashtable canonicalClass, IntStringHashtable decompose,
            LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
        this.canonicalClass = canonicalClass;
        this.decompose = decompose;
        this.compose = compose;
        this.isCompatibility = isCompatibility;
        this.isExcluded = isExcluded;
    }

    /**
     * Just accessible for testing.
     * Takes a {@code char}, so only UTF-16 code unit values can be queried.
     *
     * @param ch the character to query
     * @return true if the bit for {@code ch} is set in the exclusion set
     */
    boolean getExcluded(char ch) {
        return isExcluded.get(ch);
    }

    /**
     * Just accessible for testing.
     * Takes a {@code char}, so only UTF-16 code unit values can be queried.
     *
     * @param ch the character to query
     * @return the non-recursive mapping stored for {@code ch} in the
     *         decomposition table
     */
    String getRawDecompositionMapping(char ch) {
        return decompose.get(ch);
    }

    /**
     * Maps a code point to its combining class.
     * For now, just use IntHashtable.
     * Two-stage tables would be used in an optimized implementation.
     */
    private final IntHashtable canonicalClass;

    /**
     * The main data table: maps a code point to its decomposition mapping,
     * held as a String. A null lookup result is treated as
     * "no decomposition".
     * Two-stage tables would be used in an optimized implementation.
     */
    private final IntStringHashtable decompose;

    /**
     * Maps from pairs of characters (first in the high 32 bits of the key,
     * second in the low 32 bits) to the single composite.
     * If there is no composition, the value is expected to be NOT_COMPOSITE.
     */
    private final LongHashtable compose;

    /**
     * Tells whether a character's decomposition is a compatibility mapping
     * (bit set) or a canonical one (bit clear).
     */
    private final BitSet isCompatibility;

    /**
     * Tells whether character is script-excluded or not.
     * Used only while building, and for testing.
     */
    private final BitSet isExcluded;
}