| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| * Copyright (C) 1998-2007 International Business Machines Corporation and |
| * Unicode, Inc. All Rights Reserved.<br> |
| * The Unicode Consortium makes no expressed or implied warranty of any |
| * kind, and assumes no liability for errors or omissions. |
| * No liability is assumed for incidental and consequential damages |
| * in connection with or arising out of the use of the information here. |
| */ |
| |
| package com.ibm.icu.dev.test.normalizer; |
| |
| import com.ibm.icu.dev.test.UTF16Util; |
| |
| /** |
| * Implements Unicode Normalization Forms C, D, KC, KD.<br> |
| * See UTR#15 for details.<br> |
| * @author Mark Davis |
| * Updates for supplementary code points: |
| * Vladimir Weinstein & Markus Scherer |
| */ |
| public class UnicodeNormalizer { |
| // static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc."; |
| |
| /** |
| * Create a normalizer for a given form. |
| */ |
| public UnicodeNormalizer(byte form, boolean fullData) { |
| this.form = form; |
| if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time |
| } |
| |
| /** |
| * Masks for the form selector |
| */ |
| static final byte |
| COMPATIBILITY_MASK = 1, |
| COMPOSITION_MASK = 2; |
| |
| /** |
| * Normalization Form Selector |
| */ |
| public static final byte |
| D = 0 , |
| C = COMPOSITION_MASK, |
| KD = COMPATIBILITY_MASK, |
| KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK); |
| |
| /** |
| * Normalizes text according to the chosen form, |
| * replacing contents of the target buffer. |
| * @param source the original text, unnormalized |
| * @param target the resulting normalized text |
| */ |
| public StringBuffer normalize(String source, StringBuffer target) { |
| |
| // First decompose the source into target, |
| // then compose if the form requires. |
| |
| if (source.length() != 0) { |
| internalDecompose(source, target); |
| if ((form & COMPOSITION_MASK) != 0) { |
| internalCompose(target); |
| } |
| } |
| return target; |
| } |
| |
| /** |
| * Normalizes text according to the chosen form |
| * @param source the original text, unnormalized |
| * @return target the resulting normalized text |
| */ |
| public String normalize(String source) { |
| return normalize(source, new StringBuffer()).toString(); |
| } |
| |
| // ====================================== |
| // PRIVATES |
| // ====================================== |
| |
| /** |
| * The current form. |
| */ |
| private byte form; |
| |
| /** |
| * Decomposes text, either canonical or compatibility, |
| * replacing contents of the target buffer. |
| * @param form the normalization form. If COMPATIBILITY_MASK |
| * bit is on in this byte, then selects the recursive |
| * compatibility decomposition, otherwise selects |
| * the recursive canonical decomposition. |
| * @param source the original text, unnormalized |
| * @param target the resulting normalized text |
| */ |
| private void internalDecompose(String source, StringBuffer target) { |
| StringBuffer buffer = new StringBuffer(); |
| boolean canonical = (form & COMPATIBILITY_MASK) == 0; |
| int ch; |
| for (int i = 0; i < source.length();) { |
| buffer.setLength(0); |
| ch = UTF16Util.nextCodePoint(source, i); |
| i+=UTF16Util.codePointLength(ch); |
| data.getRecursiveDecomposition(canonical, ch, buffer); |
| |
| // add all of the characters in the decomposition. |
| // (may be just the original character, if there was |
| // no decomposition mapping) |
| |
| for (int j = 0; j < buffer.length();) { |
| ch = UTF16Util.nextCodePoint(buffer, j); |
| j+=UTF16Util.codePointLength(ch); |
| int chClass = data.getCanonicalClass(ch); |
| int k = target.length(); // insertion point |
| if (chClass != 0) { |
| |
| // bubble-sort combining marks as necessary |
| |
| int ch2; |
| for (; k > 0; k -= UTF16Util.codePointLength(ch2)) { |
| ch2 = UTF16Util.prevCodePoint(target, k); |
| if (data.getCanonicalClass(ch2) <= chClass) break; |
| } |
| } |
| UTF16Util.insertCodePoint(target, k, ch); |
| } |
| } |
| } |
| |
| /** |
| * Composes text in place. Target must already |
| * have been decomposed. |
| * @param target input: decomposed text. |
| * output: the resulting normalized text. |
| */ |
| private void internalCompose(StringBuffer target) { |
| |
| int starterPos = 0; |
| int starterCh = UTF16Util.nextCodePoint(target,0); |
| int compPos = UTF16Util.codePointLength(starterCh); |
| int lastClass = data.getCanonicalClass(starterCh); |
| if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence |
| |
| // Loop on the decomposed characters, combining where possible |
| |
| for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) { |
| int ch = UTF16Util.nextCodePoint(target, decompPos); |
| decompPos += UTF16Util.codePointLength(ch); |
| int chClass = data.getCanonicalClass(ch); |
| int composite = data.getPairwiseComposition(starterCh, ch); |
| if (composite != NormalizerData.NOT_COMPOSITE |
| && (lastClass < chClass || lastClass == 0)) { |
| UTF16Util.setCodePointAt(target, starterPos, composite); |
| starterCh = composite; |
| } else { |
| if (chClass == 0) { |
| starterPos = compPos; |
| starterCh = ch; |
| } |
| lastClass = chClass; |
| decompPos += UTF16Util.setCodePointAt(target, compPos, ch); |
| compPos += UTF16Util.codePointLength(ch); |
| } |
| } |
| target.setLength(compPos); |
| } |
| |
| /** |
| * Contains normalization data from the Unicode Character Database. |
| * use false for the minimal set, true for the real set. |
| */ |
| private static NormalizerData data = null; |
| |
| /** |
| * Just accessible for testing. |
| */ |
| boolean getExcluded (char ch) { |
| return data.getExcluded(ch); |
| } |
| |
| /** |
| * Just accessible for testing. |
| */ |
| String getRawDecompositionMapping (char ch) { |
| return data.getRawDecompositionMapping(ch); |
| } |
| } |