// main/tests/core/src/com/ibm/icu/dev/test/normalizer/UnicodeNormalizer.java - external/github.com/unicode-org/icu - Git at Google

 /*
  * Copyright (C) 1998-2007 International Business Machines Corporation and
  * Unicode, Inc. All Rights Reserved.<br>
  * The Unicode Consortium makes no expressed or implied warranty of any
  * kind, and assumes no liability for errors or omissions.
  * No liability is assumed for incidental and consequential damages
  * in connection with or arising out of the use of the information here.
  */

 package com.ibm.icu.dev.test.normalizer;

 import com.ibm.icu.dev.test.UTF16Util;

 /**
  * Implements Unicode Normalization Forms C, D, KC, KD.<br>
  * See UTR#15 for details.<br>
  * @author Mark Davis
  * Updates for supplementary code points:
  * Vladimir Weinstein & Markus Scherer
  */
 public class UnicodeNormalizer {
 //    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";

     /**
      * Creates a normalizer for the given form.
      *
      * <p>The normalization data lives in a static field shared by every
      * instance and is built only while that field is still {@code null}.
      * {@code fullData} therefore only matters for the construction that
      * actually builds the data; later constructions reuse what is there.
      * The lazy initialization is not synchronized.
      *
      * @param form     one of {@link #D}, {@link #C}, {@link #KD}, {@link #KC}
      * @param fullData forwarded to {@code NormalizerBuilder.build}:
      *                 {@code false} for the minimal set, {@code true} for
      *                 the real set
      */
     public UnicodeNormalizer(byte form, boolean fullData) {
         this.form = form;
         if (data == null) {
             data = NormalizerBuilder.build(fullData); // load 1st time
         }
     }

     /**
      * Masks for the form selector
      */
     static final byte
         COMPATIBILITY_MASK = 1,
         COMPOSITION_MASK = 2;

     /**
      * Normalization Form Selector. Each value is a combination of
      * {@link #COMPATIBILITY_MASK} and {@link #COMPOSITION_MASK}.
      */
     public static final byte
         D = 0 ,
         C = COMPOSITION_MASK,
         KD = COMPATIBILITY_MASK,
         KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);

     /**
      * Normalizes text according to the chosen form,
      * replacing contents of the target buffer.
      * An empty source leaves the target empty.
      * @param   source      the original text, unnormalized
      * @param   target      the buffer that receives the normalized text;
      *                      any previous contents are discarded
      * @return  the {@code target} buffer, for call chaining
      */
     public StringBuffer normalize(String source, StringBuffer target) {

         // Honor the documented contract: the result replaces whatever the
         // buffer held. Without this, stale contents would also be visited
         // by the reordering loop (which walks back toward index 0) and by
         // the composition pass (which starts at index 0).
         target.setLength(0);

         // First decompose the source into target,
         // then compose if the form requires.

         if (source.length() != 0) {
             internalDecompose(source, target);
             if ((form & COMPOSITION_MASK) != 0) {
                 internalCompose(target);
             }
         }
         return target;
     }

     /**
      * Normalizes text according to the chosen form
      * @param   source      the original text, unnormalized
      * @return  the resulting normalized text
      */
     public String normalize(String source) {
         return normalize(source, new StringBuffer()).toString();
     }

     // ======================================
     //                  PRIVATES
     // ======================================

     /**
      * The form selected at construction time; a combination of the mask bits.
      */
     private final byte form;

     /**
      * Decomposes text, either canonical or compatibility, appending the
      * result to the target buffer. The kind of decomposition is taken from
      * the {@link #form} field: if the COMPATIBILITY_MASK bit is on, the
      * recursive compatibility decomposition is selected, otherwise the
      * recursive canonical decomposition.
      * @param   source      the original text, unnormalized
      * @param   target      receives the decomposed text; each code point
      *                      with a non-zero canonical class is inserted
      *                      before any trailing code points whose class is
      *                      greater than its own
      */
     private void internalDecompose(String source, StringBuffer target) {
         StringBuffer buffer = new StringBuffer();
         boolean canonical = (form & COMPATIBILITY_MASK) == 0;
         int ch;
         for (int i = 0; i < source.length();) {
             buffer.setLength(0);
             ch = UTF16Util.nextCodePoint(source, i);
             i+=UTF16Util.codePointLength(ch);
             data.getRecursiveDecomposition(canonical, ch, buffer);

             // add all of the characters in the decomposition.
             // (may be just the original character, if there was
             // no decomposition mapping)

             for (int j = 0; j < buffer.length();) {
                 ch = UTF16Util.nextCodePoint(buffer, j);
                 j+=UTF16Util.codePointLength(ch);
                 int chClass = data.getCanonicalClass(ch);
                 int k = target.length(); // insertion point
                 if (chClass != 0) {

                     // bubble-sort combining marks as necessary:
                     // step k back over every preceding code point whose
                     // class is strictly greater, stopping at the first one
                     // with a class <= chClass (or at the buffer start).

                     int ch2;
                     for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
                         ch2 = UTF16Util.prevCodePoint(target, k);
                         if (data.getCanonicalClass(ch2) <= chClass) {
                             break;
                         }
                     }
                 }
                 UTF16Util.insertCodePoint(target, k, ch);
             }
         }
     }

     /**
      * Composes text in place. Target must already
      * have been decomposed, and must not be empty: the first code point
      * is read unconditionally.
      *
      * <p>The buffer is scanned once with a read position
      * ({@code decompPos}) and a write position ({@code compPos}). Each code
      * point is either merged into the current starter or copied down to
      * the write position; the buffer is truncated to the write position at
      * the end.
      * @param   target      input: decomposed text.
      *                      output: the resulting normalized text.
      */
     private void internalCompose(StringBuffer target) {

         int starterPos = 0;                                  // index of the code point compositions are merged into
         int starterCh = UTF16Util.nextCodePoint(target,0);
         int compPos = UTF16Util.codePointLength(starterCh);  // write position (end of the output so far)
         int lastClass = data.getCanonicalClass(starterCh);
         // fix for irregular combining sequence: when the text begins with a
         // non-zero-class code point, 256 makes the "lastClass < chClass"
         // test below fail for the next code point (presumably every class
         // is below 256 -- confirm against NormalizerData).
         if (lastClass != 0) {
             lastClass = 256;
         }

         // Loop on the decomposed characters, combining where possible

         for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
             int ch = UTF16Util.nextCodePoint(target, decompPos);
             decompPos += UTF16Util.codePointLength(ch);
             int chClass = data.getCanonicalClass(ch);
             int composite = data.getPairwiseComposition(starterCh, ch);
             if (composite != NormalizerData.NOT_COMPOSITE
             && (lastClass < chClass || lastClass == 0)) {
                 // Merge ch into the starter; nothing is written at compPos,
                 // so ch is dropped from the output.
                 // NOTE(review): the return value is ignored here, unlike in
                 // the else branch -- presumably a composite always has the
                 // same UTF-16 length as the starter it replaces; confirm.
                 UTF16Util.setCodePointAt(target, starterPos, composite);
                 starterCh = composite;
             } else {
                 if (chClass == 0) {
                     // ch becomes the starter for what follows
                     starterPos = compPos;
                     starterCh  = ch;
                 }
                 lastClass = chClass;
                 // Copy ch down to the write position. The read position is
                 // adjusted by the value setCodePointAt returns (presumably
                 // the change in buffer length -- confirm in UTF16Util).
                 decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
                 compPos += UTF16Util.codePointLength(ch);
             }
         }
         // Drop whatever is left beyond the last written code point.
         target.setLength(compPos);
     }

     /**
      * Contains normalization data from the Unicode Character Database.
      * use false for the minimal set, true for the real set.
      * Shared by all instances and built by the first constructor call
      * that finds it {@code null}.
      */
     private static NormalizerData data = null;

     /**
      * Just accessible for testing. Delegates to the shared normalization data.
      */
     boolean getExcluded (char ch) {
         return data.getExcluded(ch);
     }

     /**
      * Just accessible for testing. Delegates to the shared normalization data.
      */
     String getRawDecompositionMapping (char ch) {
         return data.getRawDecompositionMapping(ch);
     }
 }
	/*
	* Copyright (C) 1998-2007 International Business Machines Corporation and
	* Unicode, Inc. All Rights Reserved.<br>
	* The Unicode Consortium makes no expressed or implied warranty of any
	* kind, and assumes no liability for errors or omissions.
	* No liability is assumed for incidental and consequential damages
	* in connection with or arising out of the use of the information here.
	*/

	package com.ibm.icu.dev.test.normalizer;

	import com.ibm.icu.dev.test.UTF16Util;

	/**
	* Implements Unicode Normalization Forms C, D, KC, KD.<br>
	* See UTR#15 for details.<br>
	* @author Mark Davis
	* Updates for supplementary code points:
	* Vladimir Weinstein & Markus Scherer
	*/
	public class UnicodeNormalizer {
	// static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";

	/**
	* Create a normalizer for a given form.
	*/
	public UnicodeNormalizer(byte form, boolean fullData) {
	this.form = form;
	if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time
	}

	/**
	* Masks for the form selector
	*/
	static final byte
	COMPATIBILITY_MASK = 1,
	COMPOSITION_MASK = 2;

	/**
	* Normalization Form Selector
	*/
	public static final byte
	D = 0 ,
	C = COMPOSITION_MASK,
	KD = COMPATIBILITY_MASK,
	KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);

	/**
	* Normalizes text according to the chosen form,
	* replacing contents of the target buffer.
	* @param source the original text, unnormalized
	* @param target the resulting normalized text
	*/
	public StringBuffer normalize(String source, StringBuffer target) {

	// First decompose the source into target,
	// then compose if the form requires.

	if (source.length() != 0) {
	internalDecompose(source, target);
	if ((form & COMPOSITION_MASK) != 0) {
	internalCompose(target);
	}
	}
	return target;
	}

	/**
	* Normalizes text according to the chosen form
	* @param source the original text, unnormalized
	* @return target the resulting normalized text
	*/
	public String normalize(String source) {
	return normalize(source, new StringBuffer()).toString();
	}

	// ======================================
	// PRIVATES
	// ======================================

	/**
	* The current form.
	*/
	private byte form;

	/**
	* Decomposes text, either canonical or compatibility,
	* replacing contents of the target buffer.
	* @param form the normalization form. If COMPATIBILITY_MASK
	* bit is on in this byte, then selects the recursive
	* compatibility decomposition, otherwise selects
	* the recursive canonical decomposition.
	* @param source the original text, unnormalized
	* @param target the resulting normalized text
	*/
	private void internalDecompose(String source, StringBuffer target) {
	StringBuffer buffer = new StringBuffer();
	boolean canonical = (form & COMPATIBILITY_MASK) == 0;
	int ch;
	for (int i = 0; i < source.length();) {
	buffer.setLength(0);
	ch = UTF16Util.nextCodePoint(source, i);
	i+=UTF16Util.codePointLength(ch);
	data.getRecursiveDecomposition(canonical, ch, buffer);

	// add all of the characters in the decomposition.
	// (may be just the original character, if there was
	// no decomposition mapping)

	for (int j = 0; j < buffer.length();) {
	ch = UTF16Util.nextCodePoint(buffer, j);
	j+=UTF16Util.codePointLength(ch);
	int chClass = data.getCanonicalClass(ch);
	int k = target.length(); // insertion point
	if (chClass != 0) {

	// bubble-sort combining marks as necessary

	int ch2;
	for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
	ch2 = UTF16Util.prevCodePoint(target, k);
	if (data.getCanonicalClass(ch2) <= chClass) break;
	}
	}
	UTF16Util.insertCodePoint(target, k, ch);
	}
	}
	}

	/**
	* Composes text in place. Target must already
	* have been decomposed.
	* @param target input: decomposed text.
	* output: the resulting normalized text.
	*/
	private void internalCompose(StringBuffer target) {

	int starterPos = 0;
	int starterCh = UTF16Util.nextCodePoint(target,0);
	int compPos = UTF16Util.codePointLength(starterCh);
	int lastClass = data.getCanonicalClass(starterCh);
	if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence

	// Loop on the decomposed characters, combining where possible

	for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
	int ch = UTF16Util.nextCodePoint(target, decompPos);
	decompPos += UTF16Util.codePointLength(ch);
	int chClass = data.getCanonicalClass(ch);
	int composite = data.getPairwiseComposition(starterCh, ch);
	if (composite != NormalizerData.NOT_COMPOSITE
	&& (lastClass < chClass \|\| lastClass == 0)) {
	UTF16Util.setCodePointAt(target, starterPos, composite);
	starterCh = composite;
	} else {
	if (chClass == 0) {
	starterPos = compPos;
	starterCh = ch;
	}
	lastClass = chClass;
	decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
	compPos += UTF16Util.codePointLength(ch);
	}
	}
	target.setLength(compPos);
	}

	/**
	* Contains normalization data from the Unicode Character Database.
	* use false for the minimal set, true for the real set.
	*/
	private static NormalizerData data = null;

	/**
	* Just accessible for testing.
	*/
	boolean getExcluded (char ch) {
	return data.getExcluded(ch);
	}

	/**
	* Just accessible for testing.
	*/
	String getRawDecompositionMapping (char ch) {
	return data.getRawDecompositionMapping(ch);
	}
	}