blob: 4cf37d960d61f851891b45a72bcabfa60e829e3b [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/Normalizer.java,v $
* $Date: 2001/11/21 00:56:22 $
* $Revision: 1.13 $
*
*****************************************************************************************
*/
package com.ibm.text;
import java.lang.Character;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import com.ibm.util.CompactByteArray;
import com.ibm.util.Utility;
/**
* <tt>Normalizer</tt> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <tt>Normalizer</tt> supports the standard normalization forms described in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Technical Report #15</a>.
* <p>
* Characters with accents or other adornments can be encoded in
* several different ways in Unicode. For example, take the character "Á"
* (A-acute). In Unicode, this can be encoded as a single character (the
* "composed" form):
* <pre>
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE</pre>
* or as two separate characters (the "decomposed" form):
* <pre>
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT</pre>
* <p>
* To a user of your program, however, both of these sequences should be
* treated as the same "user-level" character "Á". When you are searching or
* comparing text, you must ensure that these two sequences are treated
* equivalently. In addition, you must handle characters with more than one
* accent. Sometimes the order of a character's combining accents is
* significant, while in other cases accent sequences in different orders are
* really equivalent.
* <p>
* Similarly, the string "ffi" can be encoded as three separate letters:
* <pre>
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I</pre>
* or as the single character
* <pre>
* FB03 LATIN SMALL LIGATURE FFI</pre>
* <p>
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility
* with existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions
* into the corresponding semantic characters. When sorting and searching, you
* will often want to use these mappings.
* <p>
* <tt>Normalizer</tt> helps solve these problems by transforming text into the
* canonical composed and decomposed forms as shown in the first example above.
* In addition, you can have it perform compatibility decompositions so that
* you can treat compatibility characters the same as their equivalents.
* Finally, <tt>Normalizer</tt> rearranges accents into the proper canonical
* order, so that you do not have to worry about accent rearrangement on your
* own.
* <p>
* <tt>Normalizer</tt> adds one optional behavior, {@link #IGNORE_HANGUL},
* that differs from
* the standard Unicode Normalization Forms. This option can be passed
* to the {@link #Normalizer constructors} and to the static
* {@link #compose compose} and {@link #decompose decompose} methods. This
* option, and any that are added in the future, will be turned off by default.
* <p>
* There are three common usage models for <tt>Normalizer</tt>. In the first,
* the static {@link #normalize normalize()} method is used to process an
* entire input string at once. Second, you can create a <tt>Normalizer</tt>
* object and use it to iterate through the normalized form of a string by
* calling {@link #first} and {@link #next}. Finally, you can use the
* {@link #setIndex setIndex()} and {@link #getIndex} methods to perform
* random-access iteration, which is very useful for searching.
* <p>
* <b>Note:</b> <tt>Normalizer</tt> objects behave like iterators and have
* methods such as <tt>setIndex</tt>, <tt>next</tt>, <tt>previous</tt>, etc.
* You should note that while the <tt>setIndex</tt> and <tt>getIndex</tt> refer
* to indices in the underlying <em>input</em> text being processed, the
* <tt>next</tt> and <tt>previous</tt> methods iterate through characters
* in the normalized <em>output</em>. This means that there is not
* necessarily a one-to-one correspondence between characters returned
* by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
* returned from <tt>setIndex</tt> and <tt>getIndex</tt>. It is for this
* reason that <tt>Normalizer</tt> does not implement the
* {@link CharacterIterator} interface.
* <p>
* <b>Note:</b> <tt>Normalizer</tt> is currently based on version 2.1.8
* of the <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
* It will be updated as later versions of Unicode are released. If you are
* using this class on a JDK that supports an earlier version of Unicode, it
* is possible that <tt>Normalizer</tt> may generate composed or decomposed
* characters for which your JDK's {@link java.lang.Character} class does not
* have any data.
* <p>
* @author Laura Werner, Mark Davis
*/
public final class Normalizer {
/**
* Constant indicating that the end of the iteration has been reached.
* This is guaranteed to have the same value as {@link CharacterIterator#DONE}.
*/
public static final char DONE = CharacterIterator.DONE;
// Bit flags combined into the int held by a Mode object. Mode.compat(),
// Mode.decomp() and Mode.compose() each test one of these bits.
// COMPAT_BIT:  set by the *_COMPAT modes (compatibility decomposition).
private static final int COMPAT_BIT = 1;
// DECOMP_BIT:  set by the DECOMP and DECOMP_COMPAT modes.
private static final int DECOMP_BIT = 2;
// COMPOSE_BIT: set by the COMPOSE and COMPOSE_COMPAT modes.
private static final int COMPOSE_BIT = 4;
/**
 * The mode of a {@link Normalizer} object, <i>i.e.</i> the Unicode
 * Normalization Form of the text that the <tt>Normalizer</tt> produces.
 * <tt>Mode</tt> objects are passed to the
 * {@link Normalizer#Normalizer constructors} and to the
 * {@link Normalizer#setMode setMode} method of <tt>Normalizer</tt>.
 * <p>
 * Clients cannot create <tt>Mode</tt> objects directly (the constructor is
 * not public). Instead, use the predefined constants
 * {@link Normalizer#NO_OP}, {@link Normalizer#COMPOSE},
 * {@link Normalizer#COMPOSE_COMPAT}, {@link Normalizer#DECOMP}, and
 * {@link Normalizer#DECOMP_COMPAT}.
 * <p>
 * @see Normalizer
 */
public static final class Mode {
    /** Combination of the COMPAT_BIT, DECOMP_BIT and COMPOSE_BIT flags. */
    final int mode;

    /**
     * Creates a mode from a combination of the mode bit flags.
     *
     * @param m the bit flags describing this mode
     */
    Mode(int m) {
        this.mode = m;
    }

    /** Returns true if this mode has the compatibility bit set. */
    final boolean compat() {
        return hasBit(COMPAT_BIT);
    }

    /** Returns true if this mode has the composition bit set. */
    final boolean compose() {
        return hasBit(COMPOSE_BIT);
    }

    /** Returns true if this mode has the decomposition bit set. */
    final boolean decomp() {
        return hasBit(DECOMP_BIT);
    }

    /** Tests whether any of the given flag bits is set in this mode. */
    private boolean hasBit(int bit) {
        return (mode & bit) != 0;
    }
}
/**
* Null operation for use with the {@link #Normalizer constructors}
* and the static {@link #normalize normalize} method. This value tells
* the <tt>Normalizer</tt> to do nothing but return unprocessed characters
* from the underlying String or CharacterIterator. If you have code which
* requires raw text at some times and normalized text at others, you can
* use <tt>NO_OP</tt> for the cases where you want raw text, rather
* than having a separate code path that bypasses <tt>Normalizer</tt>
* altogether.
* <p>
* This mode has none of the compatibility, decomposition or composition
* bits set.
* <p>
* @see #setMode
*/
public static final Mode NO_OP = new Mode(0);
/**
* Canonical decomposition followed by canonical composition. Used with the
* {@link #Normalizer constructors} and the static {@link #normalize normalize}
* method to determine the operation to be performed.
* <p>
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
* off, this operation produces output that is in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">Unicode Normalization Form</a>
* <b>C</b>.
* <p>
* @see #setMode
*/
public static final Mode COMPOSE = new Mode(COMPOSE_BIT);
/**
* Compatibility decomposition followed by canonical composition.
* Used with the {@link #Normalizer constructors} and the static
* {@link #normalize normalize} method to determine the operation to be performed.
* <p>
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
* off, this operation produces output that is in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">Unicode Normalization Form</a>
* <b>KC</b>.
* <p>
* @see #setMode
*/
public static final Mode COMPOSE_COMPAT = new Mode(COMPOSE_BIT | COMPAT_BIT);
/**
* Canonical decomposition. This value is passed to the
* {@link #Normalizer constructors} and the static {@link #normalize normalize}
* method to determine the operation to be performed.
* <p>
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
* off, this operation produces output that is in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">Unicode Normalization Form</a>
* <b>D</b>.
* <p>
* @see #setMode
*/
public static final Mode DECOMP = new Mode(DECOMP_BIT);
/**
* Compatibility decomposition. This value is passed to the
* {@link #Normalizer constructors} and the static {@link #normalize normalize}
* method to determine the operation to be performed.
* <p>
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
* off, this operation produces output that is in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">Unicode Normalization Form</a>
* <b>KD</b>.
* <p>
* @see #setMode
*/
public static final Mode DECOMP_COMPAT = new Mode(DECOMP_BIT | COMPAT_BIT);
/**
* Option to disable Hangul/Jamo composition and decomposition.
* This option applies to Korean text,
* which can be represented either in the Jamo alphabet or in Hangul
* characters, which are really just two or three Jamo combined
* into one visual glyph. Since Jamo takes up more storage space than
* Hangul, applications that process only Hangul text may wish to turn
* this option on when decomposing text.
* <p>
* The Unicode standard treats Hangul to Jamo conversion as a
* canonical decomposition, so this option must be turned <b>off</b> if you
* wish to transform strings into one of the standard
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Normalization Forms</a>.
* <p>
* This is a bit flag; it is tested against the <tt>options</tt> argument
* with a bitwise AND.
* <p>
* @see #setOption
*/
public static final int IGNORE_HANGUL = 0x0001;
//-------------------------------------------------------------------------
// Constructors
//-------------------------------------------------------------------------
/**
 * Creates a new <tt>Normalizer</tt> object for iterating over the
 * normalized form of a given string, with no optional features enabled
 * (an options value of 0).
 * <p>
 * @param str  The string to be normalized. The normalization
 *             will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 */
public Normalizer(String str, Mode mode) {
    // Same as the three-argument String constructor with options == 0.
    this(str, mode, 0);
}
/**
* Creates a new <tt>Normalizer</tt> object for iterating over the
* normalized form of a given string.
* <p>
* The <tt>opt</tt> parameter specifies which optional
* <tt>Normalizer</tt> features are to be enabled for this object.
* <p>
* The string is wrapped in a new {@link StringCharacterIterator} and passed
* to the <tt>CharacterIterator</tt>-based constructor.
* <p>
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
*
* @param opt Any optional features to be enabled.
* Currently the only available option is {@link #IGNORE_HANGUL}.
* If you want the default behavior corresponding to one of the
* standard Unicode Normalization Forms, use 0 for this argument.
*/
public Normalizer(String str, Mode mode, int opt) {
this(new StringCharacterIterator(str), mode, opt);
}
/**
* Creates a new <tt>Normalizer</tt> object for iterating over the
* normalized form of the given text, with no optional features enabled
* (an options value of 0).
* <p>
* @param iter The input text to be normalized.
*
* @param mode The normalization mode.
*
*/
public Normalizer(CharacterIterator iter, Mode mode) {
// Delegate to the full constructor with options == 0.
this(iter, mode, 0);
}
/**
 * Creates a new <tt>Normalizer</tt> object for iterating over the
 * normalized form of the given text.
 * <p>
 * The iterator is stored as given; this constructor neither clones nor
 * repositions it.
 * <p>
 * @param iter The input text to be normalized.
 *
 * @param mode The normalization mode. Must not be <tt>null</tt>.
 *
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #IGNORE_HANGUL}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 */
public Normalizer(CharacterIterator iter, Mode mode, int opt) {
    this.text = iter;
    this.mode = mode;
    this.options = opt;

    // Compatibility explosions have lower indices; skip them unless the
    // mode asks for compatibility mappings.
    if (mode.compat()) {
        this.minDecomp = 0;
    } else {
        this.minDecomp = DecompData.MAX_COMPAT;
    }
}
/**
 * Clones this <tt>Normalizer</tt> object. All properties of this
 * object are duplicated in the new object, including the cloning of any
 * {@link CharacterIterator} that was passed in to the constructor
 * or to {@link #setText(CharacterIterator) setText}.
 * However, the text storage underlying
 * the <tt>CharacterIterator</tt> is not duplicated unless the
 * iterator's <tt>clone</tt> method does so.
 * <p>
 * Only the <tt>text</tt> iterator is re-cloned here; every other field is
 * copied as-is by <tt>super.clone()</tt>, so any other object-typed field
 * is shared between the original and the copy.
 * <p>
 * NOTE(review): the class header in this file reads
 * <tt>public final class Normalizer</tt> with no <tt>implements Cloneable</tt>.
 * <tt>Object.clone()</tt> throws <tt>CloneNotSupportedException</tt> for
 * such classes, which would make this method always end in an
 * <tt>InternalError</tt> -- confirm and add <tt>Cloneable</tt> to the
 * class declaration.
 *
 * @return a copy of this object with its own clone of the text iterator
 */
public Object clone() {
    Normalizer duplicate;
    try {
        duplicate = (Normalizer) super.clone();
    }
    catch (CloneNotSupportedException ex) {
        throw new InternalError(ex.toString());
    }
    // Give the copy an independent iteration position over the input.
    duplicate.text = (CharacterIterator) text.clone();
    return duplicate;
}
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
/**
 * Normalizes a <tt>String</tt> using the given normalization operation.
 * <p>
 * The <tt>options</tt> parameter specifies which optional
 * <tt>Normalizer</tt> features are to be enabled for this operation.
 * Currently the only available option is {@link #IGNORE_HANGUL}.
 * If you want the default behavior corresponding to one of the standard
 * Unicode Normalization Forms, use 0 for this argument.
 * <p>
 * @param str the input string to be normalized.
 *
 * @param mode the normalization mode
 *
 * @param options the optional features to be enabled.
 *
 * @return the result of {@link #compose compose} for a composing mode,
 *         the result of {@link #decompose decompose} for a decomposing
 *         mode, or <tt>str</tt> itself if the mode is neither.
 */
public static String normalize(String str, Mode mode, int options) {
    // Neither composing nor decomposing: hand the input back untouched.
    if (!mode.compose() && !mode.decomp()) {
        return str;
    }
    // compose() handles decomposition and reordering itself, so a
    // composing mode never calls decompose() first.
    return mode.compose()
        ? compose(str, mode.compat(), options)
        : decompose(str, mode.compat(), options);
}
//-------------------------------------------------------------------------
// Compose methods
//-------------------------------------------------------------------------
/**
* Compose a <tt>String</tt>.
* <p>
* The input is scanned one character at a time. Characters that "explode"
* (decompose) are expanded into a scratch buffer, and that buffer is read
* back before the scan of <tt>source</tt> resumes, so the replacement
* characters go through the same classification as ordinary input.
* <p>
* The <tt>options</tt> parameter specifies which optional
* <tt>Normalizer</tt> features are to be enabled for this operation.
* Currently the only available option is {@link #IGNORE_HANGUL}.
* If you want the default behavior corresponding
* to Unicode Normalization Form <b>C</b> or <b>KC</b>,
* use 0 for this argument.
* <p>
* NOTE(review): <tt>options</tt> is never read in this method body, so
* {@link #IGNORE_HANGUL} currently has no effect here -- confirm whether
* that is intended.
* <p>
* @param source the string to be composed.
*
* @param compat Perform compatibility decomposition before composition.
* If this argument is <tt>false</tt>, only canonical
* decomposition will be performed.
*
* @param options the optional features to be enabled.
*
* @return the composed string.
*/
public static String compose(String source, boolean compat, int options)
{
StringBuffer result = new StringBuffer();
// Pending characters that must be processed before the next source char.
StringBuffer explodeBuf = new StringBuffer();
int explodePos = EMPTY; // Position in input buffer
int basePos = 0; // Position of last base in output string
int baseIndex = 0; // Index of last base in "actions" array
int classesSeenL = 0; // Combining classes seen since last base
int classesSeenH = 0; // 64-bit mask
int action;
// Compatibility explosions have lower indices; skip them if necessary
int minExplode = compat ? 0 : ComposeData.MAX_COMPAT;
int minDecomp = compat ? 0 : DecompData.MAX_COMPAT;
if (DEBUG) System.out.println("minExplode = " + minExplode);
int i = 0;
// Keep going while there is unread source text or buffered characters.
while (i < source.length() || explodePos != EMPTY) {
// Get the next char from either the buffer or the source
char ch;
if (explodePos == EMPTY) {
ch = source.charAt(i++);
} else {
ch = explodeBuf.charAt(explodePos++);
// Once the last buffered char has been taken, mark the buffer
// empty and discard its contents.
if (explodePos >= explodeBuf.length()) {
explodePos = EMPTY;
explodeBuf.setLength(0);
}
}
// Get the basic info for the character
// The lookup value holds a type (selected by TYPE_MASK) and an index
// (the bits from INDEX_SHIFT upward).
int charInfo = composeLookup(ch);
int type = charInfo & ComposeData.TYPE_MASK;
int index = charInfo >>> ComposeData.INDEX_SHIFT;
if (DEBUG) System.out.println("Got char " + Utility.hex(ch) + ", type=" + type + ", index=" + index);
// Examples of NON_COMPOSING_COMBINING with an index < minExplode:
// 00A8 017F 03D2 1FBF 1FFE
if (type == ComposeData.BASE || (type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode)) {
// Start a new base: forget the classes seen so far and remember
// where this base sits in the output.
if (DEBUG) System.out.println("New base " + Utility.hex(ch) + ", type=" + type + ", index=" + index);
classesSeenL = classesSeenH = 0;
baseIndex = index;
basePos = result.length();
result.append(ch);
}
else if (type == ComposeData.COMBINING)
{
// assert(index > 0);
int cclass = ComposeData.typeBit[index];
// typeBit is a bit value from 0..63, indicating the class.
// We use a bit mask of 2 32-bit ints.
boolean seen = 0 != ((cclass < 32) ?
(classesSeenL & (1 << cclass)) :
(classesSeenH & (1 << (cclass & 31))));
if (DEBUG) System.out.println("Class of " + Utility.hex(ch) + " = " + cclass +
" seen:" + seen +
" baseIndex:" + baseIndex +
" action:" + composeAction(baseIndex, index));
// We can only combine a character with the base if we haven't
// already seen a combining character with the same canonical class.
// We only combine characters with an index from
// 1..COMBINING_COUNT-1. Indices >= COMBINING_COUNT are
// also combining characters, but we know that they don't
// compose with anything.
if (index < ComposeData.COMBINING_COUNT && !seen
&& (action = composeAction(baseIndex, index)) > 0)
{
if (action > ComposeData.MAX_COMPOSED) {
// Pairwise explosion. Actions above this value are really
// indices into an array that in turn contains indices
// into the exploding string table
// TODO: What if there are unprocessed chars in the explode buffer?
// NOTE(review): explodeBuf is only cleared once its last char has
// been consumed, and pairExplode() appends to it. If this branch
// runs while earlier buffered chars are still pending, resetting
// explodePos to 0 would replay chars already consumed -- confirm
// whether that state is reachable.
if (DEBUG) System.out.println("Pairwise exploding");
char newBase = pairExplode(explodeBuf, action);
explodePos = 0;
result.setCharAt(basePos, newBase);
baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT;
if (DEBUG) System.out.println("New base " + Utility.hex(newBase));
} else {
// Normal pairwise combination. Replace the base char
// The action value itself is the composed character.
if (DEBUG) System.out.println("Pairwise combining");
char newBase = (char) action;
result.setCharAt(basePos, newBase);
baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT;
if (DEBUG) System.out.println("New base " + Utility.hex(newBase));
}
//
// Since there are Unicode characters that cannot be combined in arbitrary
// order, we have to re-process any combining marks that go with this
// base character. There are only four characters in Unicode that have
// this problem. If they are fixed in Unicode 3.0, this code can go away.
//
// Move every mark already emitted after the base back into the
// pending buffer and truncate the output to just past the base.
int len = result.length();
if (len - basePos > 1) {
for (int j = basePos+1; j < len; j++) {
explodeBuf.append(result.charAt(j));
}
result.setLength(basePos+1);
classesSeenL = classesSeenH = 0;
if (explodePos == EMPTY) explodePos = 0;
}
} else {
// No combination with this character
// Insert it in class order and record its class as seen.
if (DEBUG) System.out.println("No action");
bubbleAppend(result, ch, cclass);
if (cclass < 32) {
classesSeenL |= 1 << cclass;
} else {
classesSeenH |= 1 << (cclass & 31);
}
}
}
else if (index > minExplode) {
// Single exploding character
// Its replacement is queued and processed on the following iterations.
explode(explodeBuf, index);
explodePos = 0;
if (DEBUG) System.out.println("explosion: " + Utility.hex(ch) + " --> " + Utility.hex(explodeBuf));
}
else if (type == ComposeData.HANGUL && minExplode == 0) {
// If we're in compatibility mode we need to decompose Hangul to Jamo,
// because some of the Jamo might have compatibility decompositions.
hangulToJamo(ch, explodeBuf, minDecomp);
if (DEBUG) System.out.println("decomposed hangul " + Utility.hex(ch) + " to jamo " + Utility.hex(explodeBuf));
explodePos = 0;
}
else if (type == ComposeData.INITIAL_JAMO) {
// An initial jamo acts as a base that a following medial jamo may join.
classesSeenL = classesSeenH = 0;
baseIndex = ComposeData.INITIAL_JAMO_INDEX;
basePos = result.length();
result.append(ch);
if (DEBUG) System.out.println("got initial jamo " + Utility.hex(ch));
}
else if (type == ComposeData.MEDIAL_JAMO && classesSeenL == 0 && classesSeenH == 0
&& baseIndex == ComposeData.INITIAL_JAMO_INDEX) {
// If the last character was an initial jamo, we can combine it with this
// one to create a Hangul character.
// Syllable = HANGUL_BASE + (l * JAMO_VCOUNT + v) * JAMO_TCOUNT, where l and
// v are the offsets of the initial and medial jamo from their bases.
int l = result.charAt(basePos) - JAMO_LBASE;
int v = ch - JAMO_VBASE;
char newCh = (char)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
result.setCharAt(basePos, newCh);
if (DEBUG) System.out.println("got medial jamo " + Utility.hex(ch) + ", replacing with Hangul " + Utility.hex(newCh));
baseIndex = ComposeData.MEDIAL_JAMO_INDEX;
}
else if (type == ComposeData.FINAL_JAMO && classesSeenL == 0 && classesSeenH == 0
&& baseIndex == ComposeData.MEDIAL_JAMO_INDEX) {
// If the last character was a medial jamo that we turned into Hangul,
// we can add this character too.
// The final jamo's offset from JAMO_TBASE is added to the syllable.
char newCh = (char)(result.charAt(basePos) + (ch - JAMO_TBASE));
result.setCharAt(basePos, newCh);
if (DEBUG) System.out.println("got final jamo " + Utility.hex(ch) + ", replacing with Hangul " + Utility.hex(newCh));
// The syllable is complete; nothing further can attach to it.
baseIndex = 0;
basePos = -1;
classesSeenL = classesSeenH = 0;
} else {
// Anything else is copied through unchanged and clears the base state.
if (DEBUG) System.out.println("No base as of " + Utility.hex(ch));
baseIndex = 0;
basePos = -1;
classesSeenL = classesSeenH = 0;
result.append(ch);
}
}
return result.toString();
}
/**
* Compose starting with current input character and continuing
* until just before the next base char.
* <p>
* The composed characters are collected in the instance's <tt>buffer</tt>;
* <tt>bufferLimit</tt> is set to the index of the last character placed
* there (or 0 if nothing was produced).
* <p>
* <b>Input</b>:
* <ul>
* <li>underlying char iter points to first character to compose
* </ul>
* <p>
* <b>Output:</b>
* <ul>
* <li>returns first char of composition or DONE if at end
* <li>Underlying char iter is pointing at next base char or past end
* </ul>
*/
private char nextCompose()
{
if (DEBUG) System.out.println("--------------- top of nextCompose() ---------------");
int explodePos = EMPTY; // Position in input buffer
int basePos = 0; // Position of last base in output string
int baseIndex = 0; // Index of last base in "actions" array
int classesSeenL = 0; // Combining classes seen since last base
int classesSeenH = 0; // 64-bit mask
int action;
char lastBase = 0; // Only used in the DEBUG messages below
boolean chFromText = true; // True if ch was read from text, false if from explodeBuf
// Compatibility explosions have lower indices; skip them if necessary
int minExplode = mode.compat() ? 0 : ComposeData.MAX_COMPAT;
int minDecomp = mode.compat() ? 0 : DecompData.MAX_COMPAT;
// presumably leaves `buffer` allocated and empty -- initBuffer() is not
// visible here; verify.
initBuffer();
// Create the scratch buffer on first use, otherwise empty it for reuse.
if (explodeBuf == null) {
explodeBuf = new StringBuffer();
} else {
explodeBuf.setLength(0);
}
char ch = curForward();
while (ch != DONE) {
// Get the basic info for the character
// The lookup value holds a type (selected by TYPE_MASK) and an index
// (the bits from INDEX_SHIFT upward).
int charInfo = composeLookup(ch);
int type = charInfo & ComposeData.TYPE_MASK;
int index = charInfo >>> ComposeData.INDEX_SHIFT;
if (type == ComposeData.BASE || (type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode)) {
if (buffer.length() > 0 && chFromText && explodePos == EMPTY) {
// When we hit a base char in the source text, we can return the text
// that's been composed so far. We'll re-process this char next time through.
if (DEBUG) System.out.println("returning early because we hit a new base");
break;
}
classesSeenL = classesSeenH = 0;
baseIndex = index;
basePos = buffer.length();
buffer.append(ch);
if (DEBUG) System.out.println("got BASE char " + Utility.hex(ch) + ", type=" + type + ", index=" + index);
lastBase = ch;
}
else if (type == ComposeData.COMBINING)
{
// assert(index > 0);
int cclass = ComposeData.typeBit[index];
// The "seen" classes are tracked as a bit mask split across two ints:
// classes below 32 in classesSeenL, the rest in classesSeenH.
boolean seen = 0 != ((cclass < 32) ?
(classesSeenL & (1 << cclass)) :
(classesSeenH & (1 << (cclass & 31))));
if (DEBUG) System.out.println("got COMBINING char " + Utility.hex(ch) + ", type=" + type + ", index=" + index
+ ", class=" + cclass);
// We can only combine a character with the base if we haven't
// already seen a combining character with the same canonical class.
if (index < ComposeData.COMBINING_COUNT && !seen
&& (action = composeAction(baseIndex, index)) > 0)
{
if (action > ComposeData.MAX_COMPOSED) {
// Pairwise explosion. Actions above this value are really
// indices into an array that in turn contains indices
// into the exploding string table
// TODO: What if there are unprocessed chars in the explode buffer?
// NOTE(review): explodeBuf is only cleared once its last char has
// been consumed, and pairExplode() appends to it. If this branch
// runs while earlier buffered chars are still pending, resetting
// explodePos to 0 would replay chars already consumed -- confirm
// whether that state is reachable.
char newBase = pairExplode(explodeBuf, action);
explodePos = 0;
buffer.setCharAt(basePos, newBase);
baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT;
if (DEBUG) System.out.println("Pairwise explosion: " + Utility.hex(lastBase) + "," + Utility.hex(ch)
+ " --> " + Utility.hex(newBase) + "," + Utility.hex(explodeBuf));
lastBase = newBase;
} else {
// Normal pairwise combination. Replace the base char
// The action value itself is the composed character.
char newBase = (char) action;
buffer.setCharAt(basePos, newBase);
baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT;
if (DEBUG) System.out.println("Pairwise combination: " + Utility.hex(lastBase) + "," + Utility.hex(ch)
+ " --> " + Utility.hex(newBase));
lastBase = newBase;
}
//
// Since there are Unicode characters that cannot be combined in arbitrary
// order, we have to re-process any combining marks that go with this
// base character. There are only four characters in Unicode that have
// this problem. If they are fixed in Unicode 3.0, this code can go away.
//
// Move every mark already emitted after the base back into the
// pending buffer and truncate the output to just past the base.
int len = buffer.length();
if (len - basePos > 1) {
if (DEBUG) System.out.println("Reprocessing combining marks");
for (int j = basePos+1; j < len; j++) {
explodeBuf.append(buffer.charAt(j));
}
buffer.setLength(basePos+1);
classesSeenL = classesSeenH = 0;
if (explodePos == EMPTY) explodePos = 0;
}
} else {
if (DEBUG) System.out.println("char doesn't combine");
// No combination with this character
// Insert it in class order and record its class as seen.
bubbleAppend(buffer, ch, cclass);
if (cclass < 32) {
classesSeenL |= 1 << cclass;
} else {
classesSeenH |= 1 << (cclass & 31);
}
}
}
else if (index > minExplode) {
// Single exploding character
// Its replacement is queued and processed on the following iterations.
explode(explodeBuf, index);
explodePos = 0;
if (DEBUG) System.out.println("explosion: " + Utility.hex(ch) + " --> " + Utility.hex(explodeBuf));
}
else if (type == ComposeData.HANGUL && minExplode == 0) {
// If we're in compatibility mode we need to decompose Hangul to Jamo,
// because some of the Jamo might have compatibility decompositions.
hangulToJamo(ch, explodeBuf, minDecomp);
if (DEBUG) System.out.println("decomposed hangul " + Utility.hex(ch) + " to jamo " + Utility.hex(explodeBuf));
explodePos = 0;
}
else if (type == ComposeData.INITIAL_JAMO) {
if (buffer.length() > 0 && chFromText && explodePos == EMPTY) {
// When we hit a base char in the source text, we can return the text
// that's been composed so far. We'll re-process this char next time through.
if (DEBUG) System.out.println("returning early because we hit a new base");
break;
}
// An initial jamo acts as a base that a following medial jamo may join.
classesSeenL = classesSeenH = 0;
baseIndex = ComposeData.INITIAL_JAMO_INDEX;
basePos = buffer.length();
buffer.append(ch);
if (DEBUG) System.out.println("got initial jamo " + Utility.hex(ch));
}
else if (type == ComposeData.MEDIAL_JAMO && classesSeenL == 0 && classesSeenH == 0
&& baseIndex == ComposeData.INITIAL_JAMO_INDEX) {
// If the last character was an initial jamo, we can combine it with this
// one to create a Hangul character.
// Syllable = HANGUL_BASE + (l * JAMO_VCOUNT + v) * JAMO_TCOUNT, where l and
// v are the offsets of the initial and medial jamo from their bases.
int l = buffer.charAt(basePos) - JAMO_LBASE;
int v = ch - JAMO_VBASE;
char newCh = (char)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
buffer.setCharAt(basePos, newCh);
if (DEBUG) System.out.println("got medial jamo " + Utility.hex(ch) + ", replacing with Hangul " + Utility.hex(newCh));
baseIndex = ComposeData.MEDIAL_JAMO_INDEX;
}
else if (type == ComposeData.FINAL_JAMO && classesSeenL == 0 && classesSeenH == 0
&& baseIndex == ComposeData.MEDIAL_JAMO_INDEX) {
// If the last character was a medial jamo that we turned into Hangul,
// we can add this character too.
// The final jamo's offset from JAMO_TBASE is added to the syllable.
char newCh = (char)(buffer.charAt(basePos) + (ch - JAMO_TBASE));
buffer.setCharAt(basePos, newCh);
if (DEBUG) System.out.println("got final jamo " + Utility.hex(ch) + ", replacing with Hangul " + Utility.hex(newCh));
// The syllable is complete; nothing further can attach to it.
baseIndex = 0;
basePos = -1;
classesSeenL = classesSeenH = 0;
} else {
// TODO: deal with JAMO character types
// Anything else is copied through unchanged and clears the base state.
baseIndex = 0;
basePos = -1;
classesSeenL = classesSeenH = 0;
buffer.append(ch);
if (DEBUG) System.out.println("UNKNOWN char " + Utility.hex(ch));
}
// Fetch the next character: from the input text when nothing is pending,
// otherwise from the pending (exploded / re-queued) characters.
if (explodePos == EMPTY) {
ch = text.next();
chFromText = true;
} else {
ch = explodeBuf.charAt(explodePos++);
// Once the last buffered char has been taken, mark the buffer
// empty and discard its contents.
if (explodePos >= explodeBuf.length()) {
explodePos = EMPTY;
explodeBuf.setLength(0);
}
chFromText = false;
}
}
// Report the first composed character, or DONE if nothing was produced.
if (buffer.length() > 0) {
bufferLimit = buffer.length() - 1;
ch = buffer.charAt(0);
} else {
ch = DONE;
bufferLimit = 0;
}
return ch;
}
/**
 * Compose starting with the input char just before the current position
 * and continuing backward until (and including) the previous base char.
 * <p>
 * Characters are gathered backward into <tt>buffer</tt> until a base
 * character, a Hangul syllable or an initial Jamo is reached; the gathered
 * run is then composed in one call to {@link #compose compose}.
 * <p>
 * <b>Input</b>:
 * <ul>
 * <li>underlying char iter points just after last char to compose
 * </ul>
 * <p>
 * <b>Output:</b>
 * <ul>
 * <li>returns last char of resulting composed sequence, or DONE if no
 *     input characters were available
 * <li>underlying iter points to lowest-index char we consumed, i.e. the base char
 * </ul>
 */
private char prevCompose() {
    if (DEBUG) System.out.println("--------------- top of prevCompose() ---------------");

    // Compatibility explosions have lower indices; skip them if necessary
    final int minExplode = mode.compat() ? 0 : ComposeData.MAX_COMPAT;

    initBuffer();

    // Slurp up characters until we hit a base char or an initial Jamo
    for (char c = curBackward(); c != DONE; c = curBackward()) {
        buffer.insert(0, c);

        // Get the basic info for the character
        final int info = composeLookup(c);
        final int type = info & ComposeData.TYPE_MASK;
        final int index = info >>> ComposeData.INDEX_SHIFT;

        if (DEBUG) System.out.println("prevCompose got char " + Utility.hex(c) +
                                      ", type=" + type + ", index=" + index +
                                      ", minExplode=" + minExplode);

        final boolean actsAsBase =
            type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode;
        if (type == ComposeData.BASE
            || actsAsBase
            || type == ComposeData.HANGUL
            || type == ComposeData.INITIAL_JAMO)
        {
            break;
        }
    }

    char last;
    if (buffer.length() == 0) {
        // Nothing could be read backward.
        last = DONE;
    } else {
        // If anything was collected, compose it all at once....
        // TODO: The performance of this is awful; add a way to compose
        // a StringBuffer in place.
        final String composed = compose(buffer.toString(), mode.compat(), options);
        if (DEBUG) System.out.println("prevCompose called compose(" + Utility.hex(buffer) +
                                      ")->" + Utility.hex(composed));
        buffer.setLength(0);
        buffer.append(composed);

        final int lastIndex = buffer.length() - 1;
        if (lastIndex > 0) {
            // More than one char: position the buffer cursor on the last one.
            bufferLimit = bufferPos = lastIndex;
            last = buffer.charAt(lastIndex);
        } else {
            last = buffer.charAt(0);
        }
    }

    if (DEBUG) System.out.println("prevCompose returning " + Utility.hex(last));
    return last;
}
/**
 * Inserts <tt>ch</tt> into <tt>target</tt>, moving it backward past any
 * trailing characters whose compose class (as reported by
 * {@link #getComposeClass}) is greater than <tt>cclass</tt>. A
 * <tt>cclass</tt> of 1 is never moved and is simply appended.
 *
 * @param target the buffer to insert into
 * @param ch     the character to insert
 * @param cclass the compose class value of <tt>ch</tt>
 */
private static void bubbleAppend(StringBuffer target, char ch, int cclass) {
    if (DEBUG) {
        System.out.println(" bubbleAppend(" + Utility.hex(target) + ", " + Utility.hex(ch) + ", " + cclass + ")" );
        System.out.println(" getComposeClass(" + Utility.hex(ch) + ")=" + getComposeClass(ch) );
        System.out.println(" target before bubbling is : " + Utility.hex(target));
    }

    // Index at which ch will be inserted; starts out as a plain append.
    int insertAt = target.length();

    if (cclass != 1) { // 1 means combining class 0!!!
        while (insertAt > 0) {
            final char prev = target.charAt(insertAt - 1);
            final int iClass = getComposeClass(prev);
            if (DEBUG) {
                System.out.println(" getComposeClass(" + Utility.hex(prev) + ")=" + getComposeClass(prev) );
                System.out.println(" bubbleAppend: target[" + (insertAt - 1) + "]=" + Utility.hex(prev) + " is iClass=" + iClass + " CC="+ UCharacter.getCombiningClass(prev));
                System.out.println(" bubbleAppend: for ch="+ Utility.hex(ch) + " class="+cclass + " CC=" + UCharacter.getCombiningClass(ch));
            }
            if (iClass <= cclass) {
                // We've hit something we can't bubble this character past, so insert here
                break;
            }
            insertAt--;
        }
    }

    // Insert just after the character we stopped at
    if (DEBUG) System.out.println(" bubbleAppend inserting "+ Utility.hex(ch)+" at index " + insertAt);
    target.insert(insertAt, ch);
    if (DEBUG) System.out.println(" target is : " + Utility.hex(target));
}
/**
 * Returns the <tt>ComposeData.typeBit</tt> entry for a character whose
 * compose type is COMBINING, or 0 for a character of any other type.
 *
 * @param ch the character to look up
 * @return the class value from <tt>ComposeData.typeBit</tt>, or 0
 */
private static int getComposeClass(char ch) {
    final int info = composeLookup(ch);
    final int kind = info & ComposeData.TYPE_MASK;
    if(DEBUG) System.out.println(Utility.hex(ch) + " charInfo: " +info + " type : " +kind);

    if (kind != ComposeData.COMBINING) {
        return 0;
    }
    return ComposeData.typeBit[info >>> ComposeData.INDEX_SHIFT];
}
/**
 * Returns the <tt>ComposeData.lookup</tt> table entry for <tt>ch</tt>.
 */
static final int composeLookup(char ch) {
    final int info = ComposeData.lookup.elementAt(ch);
    return info;
}
/**
 * Returns the <tt>ComposeData.actions</tt> entry for a (base, combining)
 * index pair. The two indices are flattened into a single table position
 * as <tt>baseIndex + MAX_BASES * comIndex</tt>, narrowed to <tt>char</tt>.
 */
static final int composeAction(int baseIndex, int comIndex) {
    final char cell = (char)(baseIndex + ComposeData.MAX_BASES*comIndex);
    return ComposeData.actions.elementAt(cell);
}
/**
 * Appends to <tt>target</tt> the characters of <tt>ComposeData.replace</tt>
 * beginning at <tt>index</tt> and ending just before the first 0 character.
 */
static final void explode(StringBuffer target, int index) {
    for (char ch = ComposeData.replace.charAt(index);
         ch != 0;
         ch = ComposeData.replace.charAt(++index)) {
        target.append(ch);
    }
}
/**
 * Looks up the replacement string for <tt>action</tt>, appends everything
 * after its first character to <tt>target</tt> (via {@link #explode}), and
 * returns that first character, which becomes the new base char.
 */
static final char pairExplode(StringBuffer target, int action) {
    final int slot = action - ComposeData.MAX_COMPOSED;
    final int index = ComposeData.actionIndex[slot];
    // The tail of the replacement (from index + 1) goes to the output...
    explode(target, index + 1);
    // ...and the head is handed back to the caller.
    return ComposeData.replace.charAt(index); // New base char
}
//-------------------------------------------------------------------------
// Decompose methods
//-------------------------------------------------------------------------
/**
 * Static method to decompose a <tt>String</tt>.
 * <p>
 * The <tt>options</tt> parameter specifies which optional
 * <tt>Normalizer</tt> features are to be enabled for this operation.
 * Currently the only available option is {@link #IGNORE_HANGUL}.
 * The desired options should be OR'ed together to determine the value
 * of this argument. If you want the default behavior corresponding
 * to Unicode Normalization Form <b>D</b> or <b>KD</b>,
 * use 0 for this argument.
 * <p>
 * @param source the string to be decomposed.
 *
 * @param compat Perform compatibility decomposition.
 * If this argument is <tt>false</tt>, only canonical
 * decomposition will be performed.
 *
 * @param options the OR'ed option flags described above; use 0 for the
 * default behavior.
 *
 * @return the decomposed string.
 */
public static String decompose(String source, boolean compat, int options)
{
    if (DEBUG) System.out.println("--------------- top of decompose() ---------------");
    boolean hangul = (options & IGNORE_HANGUL) == 0;
    int minDecomp = compat ? 0 : DecompData.MAX_COMPAT;
    StringBuffer result = new StringBuffer();
    // Work buffer holding the output of a decomposition flagged DECOMP_RECURSE;
    // its characters are fed back through the loop before more input is read.
    StringBuffer buffer = null;
    int i = 0, bufPtr = -1; // bufPtr < 0 means "read from source"
    while (i < source.length() || bufPtr >= 0)
    {
        char ch;
        if (bufPtr >= 0) {
            ch = buffer.charAt(bufPtr++);
            if (bufPtr == buffer.length()) {
                bufPtr = -1;
            }
        } else {
            ch = source.charAt(i++);
        }
        int offset = DecompData.offsets.elementAt(ch);
        int index = offset & DecompData.DECOMP_MASK;
        if (DEBUG) System.out.println("decompose got " + Utility.hex(ch));
        if (index > minDecomp) {
            if ((offset & DecompData.DECOMP_RECURSE) != 0) {
                if (DEBUG) System.out.println(" " + Utility.hex(ch) + " has RECURSIVE decomposition, index=" + index);
                // Characters still waiting in the work buffer must survive the
                // refill below; previously they were discarded by setLength(0).
                String pending = "";
                if (buffer == null) {
                    buffer = new StringBuffer();
                } else {
                    if (bufPtr >= 0) {
                        pending = buffer.toString().substring(bufPtr);
                    }
                    buffer.setLength(0);
                }
                doAppend(DecompData.contents, index, buffer);
                buffer.append(pending);
                // Only switch to buffer input if there is something to read,
                // otherwise the next charAt() would be out of range.
                bufPtr = (buffer.length() > 0) ? 0 : -1;
            } else {
                if (DEBUG) System.out.println(" " + Utility.hex(ch) + " has decomposition, index=" + index);
                doAppend(DecompData.contents, index, result);
            }
        } else if (ch >= HANGUL_BASE && ch < HANGUL_LIMIT && hangul) {
            hangulToJamo(ch, result, minDecomp);
        } else {
            result.append(ch);
        }
    }
    // Put combining characters into canonical order.
    fixCanonical(result);
    return result.toString();
}
/**
* Decompose starting with current input character and continuing
* until just before the next base char.
* <p>
* <b>Input</b>:
* <ul>
* <li>underlying char iter points to first character to decompose
* </ul>
* <p>
* <b>Output:</b>
* <ul>
* <li>returns first char of decomposition or DONE if at end
* <li>Underlying char iter is pointing at next base char or past end
* <li>when more than the returned char was produced, the rest is left in
* <tt>buffer</tt> with <tt>bufferLimit</tt> set to the index of its last char
* </ul>
*/
private char nextDecomp()
{
if (DEBUG) System.out.println("--------------- top of nextDecomp() ---------------");
// Hangul handling is enabled unless the IGNORE_HANGUL option bit is set.
boolean hangul = (options & IGNORE_HANGUL) == 0;
char ch = curForward();
int offset = DecompData.offsets.elementAt(ch);
int index = offset & DecompData.DECOMP_MASK;
// Take the buffered path when the character has a decomposition above the
// minDecomp threshold, or when its canonical class is not BASE.
if (index > minDecomp || DecompData.canonClass.elementAt(ch) != DecompData.BASE)
{
// Empties the buffer and resets bufferPos/bufferLimit to 0.
initBuffer();
if (index > minDecomp) {
if (DEBUG) System.out.println(" " + Utility.hex(ch) + " has decomposition, index=" + index);
doAppend(DecompData.contents, index, buffer);
if ((offset & DecompData.DECOMP_RECURSE) != 0) {
// Need to decompose the output of this decomposition recursively.
// NOTE(review): i is advanced by doReplace's return value and then again
// by the loop's own i++; whether that lands exactly on the character after
// the replacement depends on doReplace's return convention - confirm.
for (int i = 0; i < buffer.length(); i++) {
ch = buffer.charAt(i);
index = DecompData.offsets.elementAt(ch) & DecompData.DECOMP_MASK;
if (index > minDecomp) {
i += doReplace(DecompData.contents, index, buffer, i);
}
}
}
} else {
buffer.append(ch);
}
// Set once at least one following non-base character has been pulled in.
boolean needToReorder = false;
// Any other combining characters that immediately follow the decomposed
// character must be included in the buffer too, because they're
// conceptually part of the same logical character.
while ((ch = text.next()) != DONE
&& DecompData.canonClass.elementAt(ch) != DecompData.BASE)
{
needToReorder = true;
// Decompose any of these characters that need it - Liu
// (the DECOMP_RECURSE flag is not examined on this path)
index = DecompData.offsets.elementAt(ch) & DecompData.DECOMP_MASK;
if (index > minDecomp) {
doAppend(DecompData.contents, index, buffer);
} else {
buffer.append(ch);
}
}
if (buffer.length() > 1 && needToReorder) {
// If there is more than one combining character in the buffer,
// put them into the canonical order.
// But we don't need to sort if the only characters are the ones that
// resulted from decomposing the base character.
fixCanonical(buffer);
}
// Expose the whole buffer to next(); bufferPos is still 0 from initBuffer().
bufferLimit = buffer.length() - 1;
ch = buffer.charAt(0);
} else {
// Just use this character, but first advance to the next one
text.next();
// Do Hangul -> Jamo decomposition if necessary
if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
initBuffer();
hangulToJamo(ch, buffer, minDecomp);
bufferLimit = buffer.length() - 1;
ch = buffer.charAt(0);
}
}
if (DEBUG) System.out.println(" nextDecomp returning " + Utility.hex(ch) + ", text index=" + text.getIndex());
return ch;
}
/**
* Decompose starting with the input char just before the current position
* and continuing backward until (and including) the previous base char.
* <p>
* <b>Input</b>:
* <ul>
* <li>underlying char iter points just after last char to decompose
* </ul>
* <p>
* <b>Output:</b>
* <ul>
* <li>returns last char of resulting decomposition sequence
* <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
* <li>when a buffer was built, <tt>bufferPos</tt> and <tt>bufferLimit</tt> both
* index its last char
* </ul>
*/
private char prevDecomp() {
if (DEBUG) System.out.println("--------------- top of prevDecomp() ---------------");
// Hangul handling is enabled unless the IGNORE_HANGUL option bit is set.
boolean hangul = (options & IGNORE_HANGUL) == 0;
char ch = curBackward();
int offset = DecompData.offsets.elementAt(ch);
int index = offset & DecompData.DECOMP_MASK;
if (DEBUG) System.out.println("prevDecomp got input char " + Utility.hex(ch));
// Take the buffered path when the character has a decomposition above the
// minDecomp threshold, or when its canonical class is not BASE.
if (index > minDecomp || DecompData.canonClass.elementAt(ch) != DecompData.BASE)
{
initBuffer();
// This method rewritten to pass conformance tests. - Liu
// Collect all characters up to the previous base char
// (each is inserted at the front, so the buffer ends up in text order;
// the walk also stops when text.previous() returns DONE)
while (ch != DONE) {
buffer.insert(0, ch);
if (DecompData.canonClass.elementAt(ch) == DecompData.BASE) break;
ch = text.previous();
}
if (DEBUG) System.out.println("prevDecomp buffer: " + Utility.hex(buffer));
// Decompose the buffer
for (int i = 0; i < buffer.length(); i++) {
ch = buffer.charAt(i);
offset = DecompData.offsets.elementAt(ch);
index = offset & DecompData.DECOMP_MASK;
if (index > minDecomp) {
// NOTE(review): j is the value returned by doReplace (a length), yet it is
// used below as the upper bound for the index i and then assigned to i.
// That only lines up with buffer positions when i was 0 - confirm intent.
int j = doReplace(DecompData.contents, index, buffer, i);
if ((offset & DecompData.DECOMP_RECURSE) != 0) {
// Need to decompose this recursively
for (; i < j; ++i) {
ch = buffer.charAt(i);
index = DecompData.offsets.elementAt(ch) & DecompData.DECOMP_MASK;
if (index > minDecomp) {
i += doReplace(DecompData.contents, index, buffer, i);
}
}
}
i = j;
}
}
if (DEBUG) System.out.println("prevDecomp buffer after decomp: " + Utility.hex(buffer));
if (buffer.length() > 1) {
// If there is more than one combining character in the buffer,
// put them into the canonical order.
fixCanonical(buffer);
}
// Iterating backward: start at the last buffered character.
bufferLimit = bufferPos = buffer.length() - 1;
ch = buffer.charAt(bufferPos);
}
else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
// Hangul syllable: expand into Jamo and hand back the last one first.
initBuffer();
hangulToJamo(ch, buffer, minDecomp);
bufferLimit = bufferPos = buffer.length() - 1;
ch = buffer.charAt(bufferPos);
}
if (DEBUG) System.out.println(" prevDecomp returning '" + ch + "' " + Utility.hex(ch) + ", text index=" + text.getIndex());
return ch;
}
/**
 * Returns the <tt>DecompData.canonClass</tt> entry for <tt>ch</tt>, with
 * negative table values shifted up by 256 so the result is never negative
 * for entries in the range -256..-1.
 */
static final int getClass(char ch) {
    int value = DecompData.canonClass.elementAt(ch);
    if (value < 0) {
        value += 256;
    }
    return value;
}
//-------------------------------------------------------------------------
// CharacterIterator overrides
//-------------------------------------------------------------------------
/**
 * Return the current character in the normalized text.
 * <p>
 * If no current character is cached (the cache holds {@link #DONE}), one is
 * produced by the routine matching the active mode and cached before being
 * returned.
 */
public char current() {
    if (currentChar != DONE) {
        // Already computed; hand back the cached value.
        return currentChar;
    }

    if (mode.compose()) {
        currentChar = nextCompose();
    } else if (mode.decomp()) {
        currentChar = nextDecomp();
    } else {
        currentChar = text.current();
    }
    return currentChar;
}
/**
 * Return the first character in the normalized text. This resets
 * the <tt>Normalizer's</tt> position to the beginning of the text.
 */
public char first() {
    final int begin = text.getBeginIndex();
    return setIndex(begin);
}
/**
 * Return the last character in the normalized text. This resets
 * the <tt>Normalizer's</tt> position to be just before the
 * input text corresponding to that normalized character.
 * <p>
 * If the input text is empty, {@link #DONE} is returned and the position
 * is left at the begin index.
 */
public char last() {
    final int begin = text.getBeginIndex();
    final int end = text.getEndIndex();

    if (end <= begin) {
        // Empty input: end - 1 would be below the begin index and
        // text.setIndex() would reject it, so there is nothing to return.
        text.setIndex(begin);
        atEnd = false;
        currentChar = DONE;
        clearBuffer();
        return DONE;
    }

    text.setIndex(end - 1); // Setting to getEndIndex() fails in 1.1
    atEnd = true;           // so work around the bug
    currentChar = DONE;     // The current char hasn't been processed
    clearBuffer();          // The buffer is empty too
    return previous();
}
/**
 * Return the next character in the normalized text and advance
 * the iteration position by one. If the end
 * of the text has already been reached, {@link #DONE} is returned.
 */
public char next() {
    if (bufferPos >= bufferLimit) {
        // Nothing left in the buffer: mark it stale and produce more output
        // using the routine that matches the current mode.
        bufferLimit = bufferPos = 0;
        if (mode.compose()) {
            currentChar = nextCompose();
        } else if (mode.decomp()) {
            currentChar = nextDecomp();
        } else {
            currentChar = text.next();
        }
    } else {
        // Buffered output is still available; step to it.
        currentChar = buffer.charAt(++bufferPos);
    }
    return currentChar;
}
/**
 * Return the previous character in the normalized text and decrement
 * the iteration position by one. If the beginning
 * of the text has already been reached, {@link #DONE} is returned.
 */
public char previous() {
    if (bufferPos <= 0) {
        // Nothing left in the buffer: mark it stale and produce more output
        // using the routine that matches the current mode.
        bufferLimit = bufferPos = 0;
        if (mode.compose()) {
            currentChar = prevCompose();
        } else if (mode.decomp()) {
            currentChar = prevDecomp();
        } else {
            currentChar = text.previous();
        }
    } else {
        // Buffered output is still available; step back to it.
        currentChar = buffer.charAt(--bufferPos);
    }
    return currentChar;
}
/**
 * Set the iteration position in the input text that is being normalized
 * and return the first normalized character at that position.
 * <p>
 * @param index the desired index in the input text.
 *
 * @return the first normalized character that is the result of iterating
 * forward starting at the given index.
 *
 * @throws IllegalArgumentException if the given index is less than
 * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
 */
public char setIndex(int index) {
    // The underlying iterator performs the range check, so it goes first;
    // nothing else is touched if it rejects the index.
    text.setIndex(index);

    // Discard any buffered output and the cached current character so that
    // current() recomputes from the new position.
    clearBuffer();
    currentChar = DONE;
    return current();
}
/**
 * Retrieve the current iteration position in the input text that is
 * being normalized. This method is useful in applications such as
 * searching, where you need to be able to determine the position in
 * the input text that corresponds to a given normalized output character.
 *
 * @return the index reported by the underlying input iterator.
 */
public final int getIndex() {
    final int position = text.getIndex();
    return position;
}
/**
 * Retrieve the index of the start of the input text. This is the begin index
 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating.
 *
 * @return the begin index reported by the underlying input iterator.
 */
public final int getBeginIndex() {
    final int begin = text.getBeginIndex();
    return begin;
}
/**
 * Retrieve the index of the end of the input text. This is the end index
 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating.
 *
 * @return the end index reported by the underlying input iterator.
 */
public final int getEndIndex() {
    final int end = text.getEndIndex();
    return end;
}
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
/**
 * Set the normalization mode for this object.
 * <p>
 * <b>Note:</b> If the normalization mode is changed while iterating
 * over a string, calls to {@link #next} and {@link #previous} may
 * return previously buffered characters in the old normalization mode
 * until the iteration is able to re-sync at the next base character.
 * It is safest to call {@link #setText setText()}, {@link #first},
 * {@link #last}, etc. after calling <tt>setMode</tt>.
 * <p>
 * @param newMode the new mode for this <tt>Normalizer</tt>.
 * The supported modes are:
 * <ul>
 * <li>{@link #COMPOSE} - Unicode canonical decomposition
 * followed by canonical composition.
 * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
 * followed by canonical composition.
 * <li>{@link #DECOMP} - Unicode canonical decomposition
 * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition.
 * <li>{@link #NO_OP} - Do nothing but return characters
 * from the underlying input text.
 * </ul>
 *
 * @see #getMode
 */
public void setMode(Mode newMode) {
    mode = newMode;

    // Compatibility modes accept every decomposition index above 0; the
    // other modes only accept indices above DecompData.MAX_COMPAT.
    if (mode.compat()) {
        minDecomp = 0;
    } else {
        minDecomp = DecompData.MAX_COMPAT;
    }
}
/**
 * Return the basic operation performed by this <tt>Normalizer</tt>.
 *
 * @return the mode most recently stored by {@link #setMode}, or the
 * field's initial value if it was never changed.
 *
 * @see #setMode
 */
public Mode getMode() {
    final Mode active = mode;
    return active;
}
/**
 * Set options that affect this <tt>Normalizer</tt>'s operation.
 * Options do not change the basic composition or decomposition operation
 * that is being performed, but they control whether
 * certain optional portions of the operation are done.
 * Currently the only available option is:
 * <p>
 * <ul>
 * <li>{@link #IGNORE_HANGUL} - Do not decompose Hangul syllables into the Jamo alphabet
 * and vice-versa. This option is off by default (<i>i.e.</i> Hangul processing
 * is enabled) since the Unicode standard specifies that Hangul to Jamo
 * is a canonical decomposition. For any of the standard Unicode Normalization
 * Forms, you should leave this option off.
 * </ul>
 * <p>
 * @param option the option whose value is to be set.
 * @param value the new setting for the option. Use <tt>true</tt> to
 * turn the option on and <tt>false</tt> to turn it off.
 *
 * @throws IllegalArgumentException if <tt>option</tt> is anything other
 * than {@link #IGNORE_HANGUL}.
 *
 * @see #getOption
 */
public void setOption(int option, boolean value) {
    if (option != IGNORE_HANGUL) {
        throw new IllegalArgumentException("Illegal option");
    }
    // Set the option's bit when turning it on, clear it when turning it off.
    options = value ? (options | option) : (options & (~option));
}
/**
 * Determine whether an option is turned on or off.
 * <p>
 * @param option the option bit(s) to test.
 *
 * @return <tt>true</tt> if any of the given bits is set in this
 * object's options, <tt>false</tt> otherwise.
 *
 * @see #setOption
 */
public boolean getOption(int option) {
    final int masked = options & option;
    return masked != 0;
}
/**
 * Set the input text over which this <tt>Normalizer</tt> will iterate.
 * The iteration position will be reset to the beginning.
 * <p>
 * @param newText The new string to be normalized.
 */
public void setText(String newText) {
    // Wrap the string in an iterator, then rewind the iteration state.
    this.text = new StringCharacterIterator(newText);
    reset();
}
/**
 * Set the input text over which this <tt>Normalizer</tt> will iterate.
 * The iteration position will be reset to the beginning.
 * <p>
 * The iterator is used directly (it is not copied), and its index is moved
 * to its begin index as part of the reset.
 * <p>
 * @param newText The new text to be normalized.
 */
public void setText(CharacterIterator newText) {
    this.text = newText;
    reset();
}
//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------
/**
 * Returns the input character at the underlying iterator's current
 * position without moving the iterator.
 */
private final char curForward() {
    final char ch = text.current();
    if (DEBUG) {
        System.out.println(" curForward returning " + Utility.hex(ch) + ", text index=" + text.getIndex());
    }
    return ch;
}
/**
 * Returns the input character to process when iterating backward.
 * <p>
 * If the <tt>atEnd</tt> flag is set, the character at the iterator's
 * current position is used; otherwise the iterator is moved back by one
 * and that character is used. The flag is always cleared afterwards.
 */
private final char curBackward() {
    char ch;
    if (atEnd) {
        ch = text.current();
    } else {
        ch = text.previous();
    }
    atEnd = false;
    if (DEBUG) {
        System.out.println(" curBackward returning " + Utility.hex(ch) + ", text index=" + text.getIndex());
    }
    return ch;
}
/**
 * Appends one encoded string from <tt>source</tt> to <tt>dest</tt>.
 * <p>
 * <tt>offset</tt> packs two values: the bits above
 * <tt>STR_INDEX_SHIFT</tt> give the starting index in <tt>source</tt>, and
 * the bits selected by <tt>STR_LENGTH_MASK</tt> give the length. A length
 * of 0 means the string is terminated by a 0x0000 character instead.
 *
 * @param source the string table to read from (callers in this class pass
 *               <tt>DecompData.contents</tt>)
 * @param offset the packed index/length value
 * @param dest   the buffer to append to
 * @return the number of characters appended
 */
static final int doAppend(String source, int offset, StringBuffer dest) {
    int index = offset >>> STR_INDEX_SHIFT;
    int length = offset & STR_LENGTH_MASK;
    if (length == 0) {
        // Zero-terminated entry: copy up to (not including) the terminator,
        // counting as we go.
        char ch;
        while ((ch = source.charAt(index++)) != 0x0000) {
            dest.append(ch);
            length++;
        }
    } else {
        // Explicit length: copy exactly that many characters.
        for (int i = 0; i < length; i++) {
            dest.append(source.charAt(index++));
        }
    }
    return length;
}
/**
 * Inserts one encoded string from <tt>source</tt> into <tt>dest</tt>
 * starting at position <tt>pos</tt>.
 * <p>
 * <tt>offset</tt> packs two values: the bits above
 * <tt>STR_INDEX_SHIFT</tt> give the starting index in <tt>source</tt>, and
 * the bits selected by <tt>STR_LENGTH_MASK</tt> give the length. A length
 * of 0 means the string is terminated by a 0x0000 character instead.
 *
 * @param source the string table to read from (callers in this class pass
 *               <tt>DecompData.contents</tt>)
 * @param offset the packed index/length value
 * @param dest   the buffer to insert into
 * @param pos    the index in <tt>dest</tt> at which the first character goes
 * @return the number of characters inserted
 */
static final int doInsert(String source, int offset, StringBuffer dest, int pos)
{
    int index = offset >>> STR_INDEX_SHIFT;
    int length = offset & STR_LENGTH_MASK;
    if (length == 0) {
        // Zero-terminated entry: insert up to (not including) the
        // terminator, counting as we go.
        char ch;
        while ((ch = source.charAt(index++)) != 0x0000) {
            dest.insert(pos++, ch);
            length++;
        }
    } else {
        // Explicit length: insert exactly that many characters.
        for (int i = 0; i < length; i++) {
            dest.insert(pos++, source.charAt(index++));
        }
    }
    return length;
}
/**
 * Replaces the character at <tt>pos</tt> in <tt>dest</tt> with one encoded
 * string from <tt>source</tt>: the first character of the string overwrites
 * the existing character and the remaining ones are inserted after it.
 * <p>
 * <tt>offset</tt> packs two values: the bits above
 * <tt>STR_INDEX_SHIFT</tt> give the starting index in <tt>source</tt>, and
 * the bits selected by <tt>STR_LENGTH_MASK</tt> give the length. A length
 * of 0 means the string is terminated by a 0x0000 character instead.
 * <p>
 * NOTE(review): the return value follows two different conventions, which
 * existing callers rely on and which is therefore preserved here. For an
 * explicit length it is that length (the total number of characters
 * written); for a zero-terminated entry it is the number of characters
 * inserted <em>after</em> the first one, i.e. the total minus one.
 *
 * @param source the string table to read from (callers in this class pass
 *               <tt>DecompData.contents</tt>)
 * @param offset the packed index/length value
 * @param dest   the buffer to modify
 * @param pos    the index in <tt>dest</tt> of the character to replace
 * @return the length as described in the note above
 */
static final int doReplace(String source, int offset, StringBuffer dest, int pos)
{
    int index = offset >>> STR_INDEX_SHIFT;
    int length = offset & STR_LENGTH_MASK;
    // The first character always overwrites the one being replaced.
    dest.setCharAt(pos++, source.charAt(index++));
    if (length == 0) {
        // Zero-terminated entry: insert the rest, counting only these
        // additional characters.
        char ch;
        while ((ch = source.charAt(index++)) != 0x0000) {
            dest.insert(pos++, ch);
            length++;
        }
    } else {
        // Explicit length: one character is already written, insert the rest.
        for (int i = 1; i < length; i++) {
            dest.insert(pos++, source.charAt(index++));
        }
    }
    return length;
}
/**
 * Rewinds the iteration state to the start of the input text: moves the
 * underlying iterator to its begin index, clears the end-of-text flag,
 * invalidates the cached current character and empties the buffer window.
 */
private void reset() {
    text.setIndex(text.getBeginIndex());
    atEnd = false;
    // Without this, current() would keep returning a character cached from
    // the previous text or position instead of recomputing it.
    currentChar = DONE;
    bufferPos = 0;
    bufferLimit = 0;
}
/**
 * Makes <tt>buffer</tt> ready for new output: allocates it on first use,
 * otherwise truncates it to length 0, and resets the buffer position and
 * limit to 0.
 */
private final void initBuffer() {
    if (buffer != null) {
        buffer.setLength(0);
    } else {
        buffer = new StringBuffer(10);
    }
    clearBuffer();
}
/**
 * Resets the buffer position and limit to 0. The buffer's contents are not
 * touched.
 */
private final void clearBuffer() {
    bufferPos = 0;
    bufferLimit = 0;
}
/**
 * Fixes the sorting sequence of non-spacing characters according to
 * their combining class. The algorithm is listed on p.3-11 in the
 * Unicode Standard 2.0. The table of combining classes is on p.4-2
 * in the Unicode Standard 2.0.
 * <p>
 * The buffer is walked from the end toward the front. Whenever a character
 * has a higher class than its right-hand neighbor, and that neighbor's
 * class is not <tt>DecompData.BASE</tt>, the two are swapped in place and
 * the walk backs up to re-check the moved character.
 *
 * @param result the string to fix.
 */
private static void fixCanonical(StringBuffer result) {
    if (result.length() == 0) {
        return; // don't bother with empty strings!
    }

    int pos = result.length() - 2;
    int leftClass = getClass(result.charAt(pos + 1));

    while (pos >= 0) {
        final int rightClass = leftClass;
        leftClass = getClass(result.charAt(pos));

        //
        // a swap is presumed to be rare (and a double-swap very rare),
        // so don't worry about efficiency here.
        //
        if (leftClass > rightClass && rightClass != DecompData.BASE) {
            // Exchange the characters at pos and pos + 1.
            final char moved = result.charAt(pos);
            result.setCharAt(pos, result.charAt(pos + 1));
            result.setCharAt(pos + 1, moved);

            // If not at the end, back up (one further, to compensate for
            // the decrement at the bottom of the loop).
            if (pos < result.length() - 2) {
                pos += 2;
            }
            // The character under pos changed, so refresh its class.
            leftClass = getClass(result.charAt(pos));
        }
        --pos;
    }
}
//-------------------------------------------------------------------------
// Hangul / Jamo conversion utilities for internal use
// See section 3.10 of The Unicode Standard, v 2.0.
//
// Package-accessible for use by ComposedCharIter
// Hangul syllable range: callers test ch >= HANGUL_BASE && ch < HANGUL_LIMIT,
// so the limit is exclusive.
static final char HANGUL_BASE = 0xac00;
static final char HANGUL_LIMIT = 0xd7a4;
// Code points added to the leading / vowel / trailing indices when converting
// between syllables and Jamo. A trailing value equal to JAMO_TBASE itself is
// treated by hangulToJamo as "no trailing consonant".
private static final char JAMO_LBASE = 0x1100;
private static final char JAMO_VBASE = 0x1161;
private static final char JAMO_TBASE = 0x11a7;
// Number of distinct leading, vowel and trailing index values.
private static final int JAMO_LCOUNT = 19;
private static final int JAMO_VCOUNT = 21;
private static final int JAMO_TCOUNT = 28;
// Number of syllables that share one leading consonant (vowels x trailings).
private static final int JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT;
/**
 * Convert a single Hangul syllable into one or more Jamo characters.
 * <p>
 * The syllable is split arithmetically into a leading, a vowel and
 * (optionally) a trailing Jamo, each of which is written to
 * <tt>result</tt> through {@link #jamoAppend}.
 *
 * @param ch          the Hangul syllable to convert; the caller is expected
 *                    to have checked that it lies in the Hangul range
 * @param result      the buffer that receives the Jamo
 * @param decompLimit passed to <tt>jamoAppend</tt> as its limit for further
 *                    decomposing each Jamo
 * @return the total number of characters appended to <tt>result</tt>
 */
static int hangulToJamo(char ch, StringBuffer result, int decompLimit) {
    final char syllableIndex = (char)(ch - HANGUL_BASE);

    final char leading = (char)(JAMO_LBASE + syllableIndex / JAMO_NCOUNT);
    final char vowel = (char)(JAMO_VBASE +
                              (syllableIndex % JAMO_NCOUNT) / JAMO_TCOUNT);
    final char trailing = (char)(JAMO_TBASE + (syllableIndex % JAMO_TCOUNT));

    int appended = jamoAppend(leading, decompLimit, result);
    appended += jamoAppend(vowel, decompLimit, result);
    // A trailing value equal to the base means the syllable has no
    // trailing consonant.
    if (trailing != JAMO_TBASE) {
        appended += jamoAppend(trailing, decompLimit, result);
    }
    return appended;
}
/**
 * Appends a Jamo character to <tt>dest</tt>, decomposing it first if its
 * <tt>DecompData.offsets</tt> entry is greater than <tt>limit</tt>.
 * <p>
 * NOTE(review): unlike the other decomposition paths in this class, the
 * table entry is used here without being masked by
 * <tt>DecompData.DECOMP_MASK</tt> - confirm that this is intended.
 *
 * @param ch    the Jamo character
 * @param limit the threshold the table entry must exceed for decomposition
 * @param dest  the buffer to append to
 * @return the number of characters appended
 */
static final int jamoAppend(char ch, int limit, StringBuffer dest) {
    final int offset = DecompData.offsets.elementAt(ch);
    if (offset <= limit) {
        // No applicable decomposition: emit the character unchanged.
        dest.append(ch);
        return 1;
    }
    return doAppend(DecompData.contents, offset, dest);
}
/**
 * Composes conjoining Jamo sequences in <tt>buffer</tt> into Hangul
 * syllables, in place, beginning at index <tt>start</tt>.
 * <p>
 * Each leading-consonant + vowel pair, optionally followed by a trailing
 * consonant, is replaced by the corresponding single syllable; all other
 * characters are kept as they are. Characters before <tt>start</tt> are
 * left untouched and the buffer is shortened by the number of characters
 * that were merged.
 *
 * @param buffer the text to compose in place
 * @param start  the index of the first character to examine
 */
private static void jamoToHangul(StringBuffer buffer, int start) {
    int limit = buffer.length() - 1;
    if (start >= limit) {
        // Fewer than two characters to look at: no pair can be formed.
        return;
    }

    // Output is written back into the same buffer. The write cursor starts
    // at "start" (not 0) so that the untouched prefix is preserved.
    int out = start;
    int in, l, v;
    for (in = start; in < limit; in++) {
        char ch = buffer.charAt(in);
        if ((l = ch - JAMO_LBASE) >= 0 && l < JAMO_LCOUNT
            && (v = buffer.charAt(in+1) - JAMO_VBASE) >= 0 && v < JAMO_VCOUNT) {
            //
            // We've found a pair of Jamo characters to compose.
            // Snarf the Jamo vowel and see if there's also a trailing char
            //
            in++; // Snarf the Jamo vowel too.
            int t = 0;
            if (in < limit) {
                t = buffer.charAt(in+1) - JAMO_TBASE;
            }
            // Index 0 is JAMO_TBASE itself, which only stands for "no
            // trailing consonant"; a real trailing consonant has an index
            // strictly between 0 and JAMO_TCOUNT.
            if (t > 0 && t < JAMO_TCOUNT) {
                in++; // Snarf the trailing consonant too
            } else {
                t = 0; // No trailing consonant
            }
            buffer.setCharAt(out++, (char)((l*JAMO_VCOUNT + v) * JAMO_TCOUNT
                                           + t + HANGUL_BASE));
        } else {
            buffer.setCharAt(out++, ch);
        }
    }
    // Copy whatever the pair scan did not reach (at most the last character).
    while (in < buffer.length()) {
        buffer.setCharAt(out++, buffer.charAt(in++));
    }
    buffer.setLength(out);
}
//-------------------------------------------------------------------------
// Private data
//-------------------------------------------------------------------------
// When true, the methods of this class print trace output to System.out.
private static final boolean DEBUG = false;
// The normalization mode; changed through setMode().
private Mode mode = DECOMP;
// Bit set of option flags; changed through setOption().
private int options = 0;
// Threshold a decomposition index must exceed to be applied; assigned in
// setMode(). NOTE(review): not derived from the initial value of "mode" at
// this declaration - presumably a constructor does it; verify.
private transient int minDecomp;
// The input text and our position in it
private CharacterIterator text;
// Set by last(); tells curBackward() to use the iterator's current character
// instead of stepping back first.
private boolean atEnd = false;
// A buffer for holding intermediate results
private StringBuffer buffer = null;
// Index in "buffer" of the character most recently returned.
private int bufferPos = 0;
// Index in "buffer" of the last character available to next().
private int bufferLimit = 0;
// Character cached by current(); DONE means it has to be recomputed.
private char currentChar;
// Another buffer for use during iterative composition
// (neither EMPTY nor explodeBuf is referenced in this part of the file)
private static final int EMPTY = -1;
private StringBuffer explodeBuf = null;
// These must agree with the constants used in NormalizerBuilder
// They describe how doAppend/doInsert/doReplace unpack an offset: the index
// is offset >>> STR_INDEX_SHIFT and the length is offset & STR_LENGTH_MASK.
static final int STR_INDEX_SHIFT = 2;
static final int STR_LENGTH_MASK = 0x0003;
};