unicodetools/com/ibm/text/UCD/NormalizerSample.java - external/github.com/unicode-org/icu - Git at Google

 package com.ibm.text.UCD;

 import java.util.*;
 import com.ibm.text.*;
 import com.ibm.icu.text.UTF16;

 import com.ibm.text.utility.*;


 /**
  * Implements Unicode Normalization Forms C, D, KC, KD.<br>
  * See UTR#15 for details.<br>
  * Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
  * The Unicode Consortium makes no expressed or implied warranty of any
  * kind, and assumes no liability for errors or omissions.
  * No liability is assumed for incidental and consequential damages
  * in connection with or arising out of the use of the information here.
  * @author Mark Davis
  */

 public class NormalizerSample implements UCD_Types {
     static final String copyright = "Copyright (C) 2001, IBM Corp. and Unicode Inc. All Rights Reserved.";

     /**
      * Debugging switch: when true, internalCompose prints a trace line
      * to System.out for every character it examines.
      */
     public static boolean SHOW_PROGRESS = false;

     /**
      * Creates a normalizer for a given form and Unicode version.
      * @param   form            normalization form selector: NFD, NFKD, NFC or NFKC
      * @param   unicodeVersion  Unicode version whose data is used; the empty
      *                          string selects UCD.latestVersion
      */
     public NormalizerSample(byte form, String unicodeVersion) {
         this.composition = (form & COMPOSITION_MASK) != 0;
         this.compatibility = (form & COMPATIBILITY_MASK) != 0;
         this.data = getData(unicodeVersion);
     }

     /**
      * Creates a normalizer for a given form, using the data of UCD.latestVersion.
      * @param   form    normalization form selector: NFD, NFKD, NFC or NFKC
      */
     public NormalizerSample(byte form) {
         this(form,"");
     }

     /**
      * Masks for the form selector
      */
     public static final byte
         COMPATIBILITY_MASK = 1,
         COMPOSITION_MASK = 2;

     /**
      * Normalization Form Selector
      */
     public static final byte
         NFD = 0 ,
         NFKD = COMPATIBILITY_MASK,
         NFC = COMPOSITION_MASK,
         NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);

     /**
      * Normalizes text according to the chosen form, writing the result into
      * the target buffer.
      * <p>Note: the buffer is not cleared first. The decomposed text is added
      * to whatever the buffer already holds (combining marks may be sorted
      * back into the existing contents), and for the composing forms the whole
      * buffer is then composed. Pass an empty buffer to obtain just the
      * normalized form of source. If source is empty, target is left untouched.
      * @param   source      the original text, unnormalized
      * @param   target      the buffer receiving the normalized text
      * @return  the target buffer
      */
     public StringBuffer normalize(String source, StringBuffer target) {

         // First decompose the source into target,
         // then compose if the form requires.

         if (source.length() != 0) {
             internalDecompose(source, target);
             if (composition) {
                 internalCompose(target);
             }
         }
         return target;
     }

     /**
      * Normalizes text according to the chosen form.
      * @param   source      the original text, unnormalized
      * @return  the resulting normalized text
      */
     public String normalize(String source) {
         return normalize(source, new StringBuffer()).toString();
     }

     /**
      * Normalizes a single code point according to the chosen form.
      * @param   cp          the code point to normalize
      * @return  the resulting normalized text
      */
     public String normalize(int cp) {
         return normalize(UTF16.valueOf(cp));
     }

     /**
      * Scratch buffer reused by hasDecomposition on every call.
      */
     private final StringBuffer hasDecompositionBuffer = new StringBuffer();

     /**
      * Checks whether normalizing a code point with this normalizer yields
      * anything other than the code point itself.
      * @param   cp          the code point to test
      * @return  true if the normalized text differs from cp
      */
     public boolean hasDecomposition(int cp) {
         String original = UTF16.valueOf(cp);
         hasDecompositionBuffer.setLength(0);
         normalize(original, hasDecompositionBuffer);
         // Compare whole strings rather than "length() != 1": a supplementary
         // code point occupies two UTF-16 code units even when it is unchanged.
         return !original.equals(hasDecompositionBuffer.toString());
     }

     /**
      * Utility: Checks whether the normalization data marks a character as
      * changing under this normalizer's form (canonical or compatibility,
      * decomposing or composing, according to the particular normalizer).
      * @param   ch      the source character
      * @return  the answer given by the underlying data for this form
      */
     public boolean normalizationDiffers(int ch) {
         return data.normalizationDiffers(ch, composition, compatibility);
     }

     /**
      * Utility: Gets recursive decomposition of a character from the
      * Unicode Character Database. It is the recursive canonical decomposition
      * for the canonical forms, and the recursive compatibility AND canonical
      * decomposition for the compatibility forms of this normalizer.
      * @param   ch      the source character
      * @param   buffer  buffer to which the decomposition is appended
      */
     public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
         data.getRecursiveDecomposition(ch, buffer, compatibility);
     }


     // ======================================
     //                  PRIVATES
     // ======================================

     /**
      * The current form, split into its two mask bits.
      */
     private final boolean composition;
     private final boolean compatibility;

     /**
      * Decomposes text, either canonical or compatibility (according to the
      * compatibility flag of this normalizer), adding the result to the
      * target buffer in canonical order.
      * @param   source      the original text, unnormalized
      * @param   target      the buffer receiving the decomposed text
      */
     private void internalDecompose(String source, StringBuffer target) {
         StringBuffer buffer = new StringBuffer();
         int ch32;
         for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
             buffer.setLength(0);
             ch32 = UTF16.charAt(source, i);
             data.getRecursiveDecomposition(ch32, buffer, compatibility);

             // add all of the characters in the decomposition.
             // (may be just the original character, if there was
             // no decomposition mapping)

             int ch;
             for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
                 ch = UTF16.charAt(buffer, j);
                 int chClass = data.getCanonicalClass(ch);
                 int k = target.length(); // insertion point
                 if (chClass != 0) {

                     // bubble-sort combining marks as necessary:
                     // walk back over characters with a higher class

                     int ch2;
                     for (; k > 0; k -= UTF16.getCharCount(ch2)) {
                         ch2 = UTF16.charAt(target, k-1);
                         if (data.getCanonicalClass(ch2) <= chClass) break;
                     }
                 }
                 target.insert(k, UTF16.valueOf(ch));
             }
         }
     }

     /**
      * Composes text in place. Target must already
      * have been decomposed, and must not be empty.
      * Uses UTF16, which is a utility class for supplementary character support in Java.
      * @param   target      input: decomposed text.
      *                      output: the resulting normalized text.
      */
     private void internalCompose(StringBuffer target) {
         int starterPos = 0;
         int starterCh = UTF16.charAt(target,0);
         int compPos = UTF16.getCharCount(starterCh); // length of last composition
         int lastClass = data.getCanonicalClass(starterCh);
         if (lastClass != 0) lastClass = 256; // fix for strings starting with a combining mark
         int oldLen = target.length();

         // Loop on the decomposed characters, combining where possible

         int ch;
         for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
             ch = UTF16.charAt(target, decompPos);
             if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
                 + ", decompPos: " + decompPos
                 + ", compPos: " + compPos
                 + ", ch: " + Utility.hex(ch)
                 );
             int chClass = data.getCanonicalClass(ch);
             int composite = data.getPairwiseComposition(starterCh, ch);
             if (composite != Stub.NOT_COMPOSITE
             && (lastClass < chClass || lastClass == 0)) {
                 // ch combines with the current starter: overwrite the starter
                 // and drop ch (compPos does not advance)
                 UTF16.setCharAt(target, starterPos, composite);
                 // we know that we will only be replacing non-supplementaries by non-supplementaries
                 // so we don't have to adjust the decompPos
                 starterCh = composite;
             } else {
                 // ch is kept: it becomes the new starter if its class is zero
                 if (chClass == 0) {
                     starterPos = compPos;
                     starterCh  = ch;
                 }
                 lastClass = chClass;
                 UTF16.setCharAt(target, compPos, ch);
                 if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
                     System.out.println("ADJUSTING: " + Utility.hex(target));
                     decompPos += target.length() - oldLen;
                     oldLen = target.length();
                 }
                 compPos += UTF16.getCharCount(ch);
             }
         }
         // everything past compPos is left over from the decomposed text
         target.setLength(compPos);
     }

     // The following class makes use of the UCD class, which accesses data in the Unicode Character Database

     /**
      * Normalization data for one Unicode version, built from a UCD instance.
      */
     static class Stub {
         private final UCD ucd;
         /** Maps (first << 32 | second), as a Long, to the Integer composite code point. */
         private final HashMap compTable = new HashMap();
         /** Code points that are non-lead Jamo or the second half of a composition pair. */
         private final BitSet isSecond = new BitSet();
         /** Code points entered in compTable. */
         private final BitSet canonicalRecompose = new BitSet();
         /** Subset of canonicalRecompose whose two components have a decomposition type <= CANONICAL. */
         private final BitSet compatibilityRecompose = new BitSet();
         /** Value returned by getPairwiseComposition when the pair does not compose. */
         static final int NOT_COMPOSITE = 0xFFFF;

         /**
          * Builds the composition table and the recomposition sets.
          * @param   version     Unicode version handed to UCD.make
          */
         Stub(String version) {
             ucd = UCD.make(version);
             // NOTE(review): the bound skips U+10FFFF itself; presumably harmless
             // because that code point is a noncharacter - confirm.
             for (int i = 0; i < 0x10FFFF; ++i) {
                 if (!ucd.isAssigned(i)) continue;
                 if (ucd.isPUA(i)) continue;
                 if (ucd.isNonLeadJamo(i)) isSecond.set(i);
                 byte dt = ucd.getDecompositionType(i);
                 if (dt != CANONICAL) continue;
                 if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
                     try {
                         String s = ucd.getDecompositionMapping(i);
                         int len = UTF16.countCodePoint(s);
                         if (len != 2) {
                             if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
                             continue;
                         }
                         int a = UTF16.charAt(s, 0);
                         // only pairs whose first character has class zero are recorded
                         if (ucd.getCombiningClass(a) != 0) continue;

                         int b = UTF16.charAt(s, UTF16.getCharCount(a));
                         isSecond.set(b);

                         // have a recomposition, so set the bit
                         canonicalRecompose.set(i);

                         // set the compatibility recomposition bit
                         // ONLY if the component characters
                         // don't compatibility decompose
                         if (ucd.getDecompositionType(a) <= CANONICAL
                          && ucd.getDecompositionType(b) <= CANONICAL) {
                             compatibilityRecompose.set(i);
                         }

                         long key = (((long)a)<<32) | b;

                         compTable.put(new Long(key), new Integer(i));
                     } catch (Exception e) {
                         throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
                     }
                 }
             }
         }

         /**
          * @param   cp      the code point
          * @return  the combining class reported by the UCD for cp
          */
         short getCanonicalClass(int cp) {
             return ucd.getCombiningClass(cp);
         }

         /**
          * @param   cp      the code point
          * @return  true if cp was recorded as a non-lead Jamo or as the second
          *          character of a composition pair
          */
         boolean isTrailing(int cp) {
             return isSecond.get(cp);
         }

         /**
          * Checks, from the decomposition type and the recomposition sets,
          * whether a code point is marked as changing under a given form.
          * @param   cp              the code point
          * @param   composition     true for the composing forms
          * @param   compatibility   true for the compatibility forms
          */
         boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
             byte dt = ucd.getDecompositionType(cp);
             if (!composition) {
                 if (compatibility) return dt >= CANONICAL;
                 else return dt == CANONICAL;
             } else {
                 // almost the same, except that we add back in the characters
                 // that RECOMPOSE
                 if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
                 else return dt == CANONICAL && !canonicalRecompose.get(cp);
             }
         }

         /**
          * Appends the recursive decomposition of a code point to a buffer.
          * @param   cp              the code point to decompose
          * @param   buffer          buffer to which the decomposition is appended
          * @param   compatibility   if false only canonical mappings are followed,
          *                          otherwise compatibility AND canonical mappings
          */
         public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
             byte dt = ucd.getDecompositionType(cp);
             // we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
             if (dt == CANONICAL || dt > CANONICAL && compatibility) {
                 String s = ucd.getDecompositionMapping(cp);
                 int part;
                 for (int i = 0; i < s.length(); i += UTF16.getCharCount(part)) {
                     part = UTF16.charAt(s, i);
                     getRecursiveDecomposition(part, buffer, compatibility);
                 }
             } else {
                 UTF16.append(buffer, cp);
             }
         }

         /**
          * Looks up the composition of a pair of code points.
          * @param   starterCh   the first code point of the pair
          * @param   ch          the second code point of the pair
          * @return  the composite, or NOT_COMPOSITE if the pair does not compose
          */
         int getPairwiseComposition(int starterCh, int ch) {
             // 0xFFFF is the value UCD.composeHangul is tested against for "no composition"
             int hangulPoss = UCD.composeHangul(starterCh, ch);
             if (hangulPoss != 0xFFFF) return hangulPoss;
             Object obj = compTable.get(new Long((((long)starterCh)<<32) | ch));
             if (obj == null) return NOT_COMPOSITE;
             return ((Integer)obj).intValue();
         }

     }

     /**
      * Contains normalization data from the Unicode Character Database,
      * for the Unicode version chosen at construction.
      */
     private final Stub data;

     /**
      * Cache of Stub instances, keyed by Unicode version string.
      */
     private static final HashMap versionCache = new HashMap();

     /**
      * Returns the (cached) data for a Unicode version, building it on first use.
      * No synchronization is performed here.
      * @param   version     the Unicode version; the empty string selects UCD.latestVersion
      */
     private static Stub getData (String version) {
         if (version.length() == 0) version = UCD.latestVersion;
         Stub result = (Stub)versionCache.get(version);
         if (result == null) {
             result = new Stub(version);
             versionCache.put(version, result);
         }
         return result;
     }
 }
	package com.ibm.text.UCD;

	import java.util.*;
	import com.ibm.text.*;
	import com.ibm.icu.text.UTF16;

	import com.ibm.text.utility.*;


	/**
	* Implements Unicode Normalization Forms C, D, KC, KD.<br>
	* See UTR#15 for details.<br>
	* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
	* The Unicode Consortium makes no expressed or implied warranty of any
	* kind, and assumes no liability for errors or omissions.
	* No liability is assumed for incidental and consequential damages
	* in connection with or arising out of the use of the information here.
	* @author Mark Davis
	*/

	public class NormalizerSample implements UCD_Types {
	static final String copyright = "Copyright (C) 2001, IBM Corp. and Unicode Inc. All Rights Reserved.";

	public static boolean SHOW_PROGRESS = false;

	/**
	* Create a normalizer for a given form.
	*/
	public NormalizerSample(byte form, String unicodeVersion) {
	this.composition = (form & COMPOSITION_MASK) != 0;
	this.compatibility = (form & COMPATIBILITY_MASK) != 0;
	this.data = getData(unicodeVersion);
	}

	/**
	* Create a normalizer for a given form.
	*/
	public NormalizerSample(byte form) {
	this(form,"");
	}

	/**
	* Masks for the form selector
	*/
	public static final byte
	COMPATIBILITY_MASK = 1,
	COMPOSITION_MASK = 2;

	/**
	* Normalization Form Selector
	*/
	public static final byte
	NFD = 0 ,
	NFKD = COMPATIBILITY_MASK,
	NFC = COMPOSITION_MASK,
	NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);

	/**
	* Normalizes text according to the chosen form,
	* replacing contents of the target buffer.
	* @param source the original text, unnormalized
	* @param target the resulting normalized text
	*/
	public StringBuffer normalize(String source, StringBuffer target) {

	// First decompose the source into target,
	// then compose if the form requires.

	if (source.length() != 0) {
	internalDecompose(source, target);
	if (composition) {
	internalCompose(target);
	}
	}
	return target;
	}

	/**
	* Normalizes text according to the chosen form
	* @param source the original text, unnormalized
	* @return target the resulting normalized text
	*/
	public String normalize(String source) {
	return normalize(source, new StringBuffer()).toString();
	}

	/**
	* Normalizes text according to the chosen form
	* @param newLocaleID the original text, unnormalized
	* @return target the resulting normalized text
	*/
	public String normalize(int cp) {
	return normalize(UTF16.valueOf(cp));
	}

	/**
	*/
	private StringBuffer hasDecompositionBuffer = new StringBuffer();

	public boolean hasDecomposition(int cp) {
	hasDecompositionBuffer.setLength(0);
	normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
	if (hasDecompositionBuffer.length() != 1) return true;
	return cp != hasDecompositionBuffer.charAt(0);
	}

	/**
	* Utility: Checks whether there is a recursive decomposition of a character from the
	* Unicode Character Database. It is compatibility or canonical according to the particular
	* normalizer.
	* @param ch the source character
	*/
	public boolean normalizationDiffers(int ch) {
	return data.normalizationDiffers(ch, composition, compatibility);
	}

	/**
	* Utility: Gets recursive decomposition of a character from the
	* Unicode Character Database.
	* @param compatibility If false selects the recursive
	* canonical decomposition, otherwise selects
	* the recursive compatibility AND canonical decomposition.
	* @param ch the source character
	* @param buffer buffer to be filled with the decomposition
	*/
	public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
	data.getRecursiveDecomposition(ch, buffer, compatibility);
	}


	// ======================================
	// PRIVATES
	// ======================================

	/**
	* The current form.
	*/
	private boolean composition;
	private boolean compatibility;

	/**
	* Decomposes text, either canonical or compatibility,
	* replacing contents of the target buffer.
	* @param form the normalization form. If COMPATIBILITY_MASK
	* bit is on in this byte, then selects the recursive
	* compatibility decomposition, otherwise selects
	* the recursive canonical decomposition.
	* @param source the original text, unnormalized
	* @param target the resulting normalized text
	*/
	private void internalDecompose(String source, StringBuffer target) {
	StringBuffer buffer = new StringBuffer();
	int ch32;
	for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
	buffer.setLength(0);
	ch32 = UTF16.charAt(source, i);
	data.getRecursiveDecomposition(ch32, buffer, compatibility);

	// add all of the characters in the decomposition.
	// (may be just the original character, if there was
	// no decomposition mapping)

	int ch;
	for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
	ch = UTF16.charAt(buffer, j);
	int chClass = data.getCanonicalClass(ch);
	int k = target.length(); // insertion point
	if (chClass != 0) {

	// bubble-sort combining marks as necessary

	int ch2;
	for (; k > 0; k -= UTF16.getCharCount(ch2)) {
	ch2 = UTF16.charAt(target, k-1);
	if (data.getCanonicalClass(ch2) <= chClass) break;
	}
	}
	target.insert(k, UTF16.valueOf(ch));
	}
	}
	}

	/**
	* Composes text in place. Target must already
	* have been decomposed.
	* Uses UTF16, which is a utility class for supplementary character support in Java.
	* @param target input: decomposed text.
	* output: the resulting normalized text.
	*/
	private void internalCompose(StringBuffer target) {
	int starterPos = 0;
	int starterCh = UTF16.charAt(target,0);
	int compPos = UTF16.getCharCount(starterCh); // length of last composition
	int lastClass = data.getCanonicalClass(starterCh);
	if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
	int oldLen = target.length();

	// Loop on the decomposed characters, combining where possible

	int ch;
	for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
	ch = UTF16.charAt(target, decompPos);
	if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
	+ ", decompPos: " + decompPos
	+ ", compPos: " + compPos
	+ ", ch: " + Utility.hex(ch)
	);
	int chClass = data.getCanonicalClass(ch);
	int composite = data.getPairwiseComposition(starterCh, ch);
	if (composite != data.NOT_COMPOSITE
	&& (lastClass < chClass \|\| lastClass == 0)) {
	UTF16.setCharAt(target, starterPos, composite);
	// we know that we will only be replacing non-supplementaries by non-supplementaries
	// so we don't have to adjust the decompPos
	starterCh = composite;
	} else {
	if (chClass == 0) {
	starterPos = compPos;
	starterCh = ch;
	}
	lastClass = chClass;
	UTF16.setCharAt(target, compPos, ch);
	if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
	System.out.println("ADJUSTING: " + Utility.hex(target));
	decompPos += target.length() - oldLen;
	oldLen = target.length();
	}
	compPos += UTF16.getCharCount(ch);
	}
	}
	target.setLength(compPos);
	}

	// The following class makes use of the UCD class, which accesses data in the Unicode Character Database

	static class Stub {
	private UCD ucd;
	private HashMap compTable = new HashMap();
	private BitSet isSecond = new BitSet();
	private BitSet canonicalRecompose = new BitSet();
	private BitSet compatibilityRecompose = new BitSet();
	static final int NOT_COMPOSITE = 0xFFFF;

	Stub(String version) {
	ucd = UCD.make(version);
	for (int i = 0; i < 0x10FFFF; ++i) {
	if (!ucd.isAssigned(i)) continue;
	if (ucd.isPUA(i)) continue;
	if (ucd.isNonLeadJamo(i)) isSecond.set(i);
	byte dt = ucd.getDecompositionType(i);
	if (dt != CANONICAL) continue;
	if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
	try {
	String s = ucd.getDecompositionMapping(i);
	int len = UTF16.countCodePoint(s);
	if (len != 2) {
	if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
	continue;
	}
	int a = UTF16.charAt(s, 0);
	if (ucd.getCombiningClass(a) != 0) continue;

	int b = UTF16.charAt(s, UTF16.getCharCount(a));
	isSecond.set(b);

	// have a recomposition, so set the bit
	canonicalRecompose.set(i);

	// set the compatibility recomposition bit
	// ONLY if the component characters
	// don't compatibility decompose
	if (ucd.getDecompositionType(a) <= CANONICAL
	&& ucd.getDecompositionType(b) <= CANONICAL) {
	compatibilityRecompose.set(i);
	}

	long key = (((long)a)<<32) \| b;

	compTable.put(new Long(key), new Integer(i));
	} catch (Exception e) {
	throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
	}
	}
	}
	}

	short getCanonicalClass(int cp) {
	return ucd.getCombiningClass(cp);
	}

	boolean isTrailing(int cp) {
	return isSecond.get(cp);
	}

	boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
	byte dt = ucd.getDecompositionType(cp);
	if (!composition) {
	if (compatibility) return dt >= CANONICAL;
	else return dt == CANONICAL;
	} else {
	// almost the same, except that we add back in the characters
	// that RECOMPOSE
	if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
	else return dt == CANONICAL && !canonicalRecompose.get(cp);
	}
	}

	public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
	byte dt = ucd.getDecompositionType(cp);
	// we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
	if (dt == CANONICAL \|\| dt > CANONICAL && compatibility) {
	String s = ucd.getDecompositionMapping(cp);
	for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
	cp = UTF16.charAt(s, i);
	getRecursiveDecomposition(cp, buffer, compatibility);
	}
	} else {
	UTF16.append(buffer, cp);
	}
	}

	int getPairwiseComposition(int starterCh, int ch) {
	int hangulPoss = UCD.composeHangul(starterCh, ch);
	if (hangulPoss != 0xFFFF) return hangulPoss;
	Object obj = compTable.get(new Long((((long)starterCh)<<32) \| ch));
	if (obj == null) return 0xFFFF;
	return ((Integer)obj).intValue();
	}

	}

	/**
	* Contains normalization data from the Unicode Character Database.
	* use false for the minimal set, true for the real set.
	*/
	private Stub data;

	private static HashMap versionCache = new HashMap();

	private static Stub getData (String version) {
	if (version.length() == 0) version = UCD.latestVersion;
	Stub result = (Stub)versionCache.get(version);
	if (result == null) {
	result = new Stub(version);
	versionCache.put(version, result);
	}
	return result;
	}
	}