blob: ed9d722ec2d50f74a63c06b697a3b34537e6e519 [file] [log] [blame]
/*
* Copyright (C) 1998-2007 International Business Machines Corporation and
* Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
*/
package com.ibm.icu.dev.test.normalizer;
import com.ibm.icu.dev.test.UTF16Util;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
* See UTR#15 for details.<br>
* @author Mark Davis
* Updates for supplementary code points:
* Vladimir Weinstein & Markus Scherer
*/
public class UnicodeNormalizer {
// static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
/**
* Create a normalizer for a given form.
*/
public UnicodeNormalizer(byte form, boolean fullData) {
this.form = form;
if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time
}
/**
* Masks for the form selector
*/
static final byte
COMPATIBILITY_MASK = 1,
COMPOSITION_MASK = 2;
/**
* Normalization Form Selector
*/
public static final byte
D = 0 ,
C = COMPOSITION_MASK,
KD = COMPATIBILITY_MASK,
KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
/**
* Normalizes text according to the chosen form,
* replacing contents of the target buffer.
* @param source the original text, unnormalized
* @param target the resulting normalized text
*/
public StringBuffer normalize(String source, StringBuffer target) {
// First decompose the source into target,
// then compose if the form requires.
if (source.length() != 0) {
internalDecompose(source, target);
if ((form & COMPOSITION_MASK) != 0) {
internalCompose(target);
}
}
return target;
}
/**
* Normalizes text according to the chosen form
* @param source the original text, unnormalized
* @return target the resulting normalized text
*/
public String normalize(String source) {
return normalize(source, new StringBuffer()).toString();
}
// ======================================
// PRIVATES
// ======================================
/**
* The current form.
*/
private byte form;
/**
* Decomposes text, either canonical or compatibility,
* replacing contents of the target buffer.
* @param form the normalization form. If COMPATIBILITY_MASK
* bit is on in this byte, then selects the recursive
* compatibility decomposition, otherwise selects
* the recursive canonical decomposition.
* @param source the original text, unnormalized
* @param target the resulting normalized text
*/
private void internalDecompose(String source, StringBuffer target) {
StringBuffer buffer = new StringBuffer();
boolean canonical = (form & COMPATIBILITY_MASK) == 0;
int ch;
for (int i = 0; i < source.length();) {
buffer.setLength(0);
ch = UTF16Util.nextCodePoint(source, i);
i+=UTF16Util.codePointLength(ch);
data.getRecursiveDecomposition(canonical, ch, buffer);
// add all of the characters in the decomposition.
// (may be just the original character, if there was
// no decomposition mapping)
for (int j = 0; j < buffer.length();) {
ch = UTF16Util.nextCodePoint(buffer, j);
j+=UTF16Util.codePointLength(ch);
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
if (chClass != 0) {
// bubble-sort combining marks as necessary
int ch2;
for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
ch2 = UTF16Util.prevCodePoint(target, k);
if (data.getCanonicalClass(ch2) <= chClass) break;
}
}
UTF16Util.insertCodePoint(target, k, ch);
}
}
}
/**
* Composes text in place. Target must already
* have been decomposed.
* @param target input: decomposed text.
* output: the resulting normalized text.
*/
private void internalCompose(StringBuffer target) {
int starterPos = 0;
int starterCh = UTF16Util.nextCodePoint(target,0);
int compPos = UTF16Util.codePointLength(starterCh);
int lastClass = data.getCanonicalClass(starterCh);
if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence
// Loop on the decomposed characters, combining where possible
for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
int ch = UTF16Util.nextCodePoint(target, decompPos);
decompPos += UTF16Util.codePointLength(ch);
int chClass = data.getCanonicalClass(ch);
int composite = data.getPairwiseComposition(starterCh, ch);
if (composite != NormalizerData.NOT_COMPOSITE
&& (lastClass < chClass || lastClass == 0)) {
UTF16Util.setCodePointAt(target, starterPos, composite);
starterCh = composite;
} else {
if (chClass == 0) {
starterPos = compPos;
starterCh = ch;
}
lastClass = chClass;
decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
compPos += UTF16Util.codePointLength(ch);
}
}
target.setLength(compPos);
}
/**
* Contains normalization data from the Unicode Character Database.
* use false for the minimal set, true for the real set.
*/
private static NormalizerData data = null;
/**
* Just accessible for testing.
*/
boolean getExcluded (char ch) {
return data.getExcluded(ch);
}
/**
* Just accessible for testing.
*/
String getRawDecompositionMapping (char ch) {
return data.getRawDecompositionMapping(ch);
}
}