main/classes/charset/src/com/ibm/icu/charset/CharsetSelector.java - external/github.com/unicode-org/icu - Git at Google

 /*
  ******************************************************************************
  * Copyright (C) 1996-2010, International Business Machines Corporation and   *
  * others. All Rights Reserved.                                               *
  ******************************************************************************
  */

 /*
  * This is a port of the C++ class UConverterSelector.
  *
  * Methods related to serialization are not ported in this version. In addition,
  * the selectForUTF8 method is not going to be ported, as UTF8 is seldom used
  * in Java.
  *
  * @author Shaopeng Jia
  */

 package com.ibm.icu.charset;

 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
 import java.util.List;

 import com.ibm.icu.impl.IntTrie;
 import com.ibm.icu.impl.PropsVectors;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

 /**
  * Charset Selector
  *
  * <p>A charset selector is built from a list of charset names. Given an input
  * CharSequence, it returns the names of those charsets that can convert every
  * character of that input.
  *
  * <p>All state is assigned once in the constructor and never modified
  * afterwards.
  *
  * @stable ICU 4.2
  */
 public final class CharsetSelector {
     // Each int column of the bit table carries the flags of up to 32 charsets.
     private static final int BITS_PER_COLUMN = 32;

     // Maps a code point to the index in pv at which that code point's row of
     // column masks starts.
     private final IntTrie trie;
     // Table of bits: within a row, bit (i % 32) of column (i / 32) is set when
     // the code point is in the UnicodeSet reported by encodings[i], or when
     // the code point is excluded from consideration.
     private final int[] pv;
     // Encodings the user asked to use, in the order supplied.
     private final String[] encodings;
     // Number of int columns needed to hold one bit per encoding.
     private final int columns;

     // Builds the property vectors that back the selector: one bit per
     // (code point, encoding) pair, plus all-ones rows for the error value and
     // for every excluded code point.
     private static PropsVectors buildSelectorData(String[] names,
             int columnCount, UnicodeSet excludedCodePoints, int mappingTypes) {
         PropsVectors pvec = new PropsVectors(columnCount);

         // set errorValue to all-ones
         for (int col = 0; col < columnCount; ++col) {
             pvec.setValue(PropsVectors.ERROR_VALUE_CP,
                     PropsVectors.ERROR_VALUE_CP, col, ~0, ~0);
         }

         for (int i = 0; i < names.length; ++i) {
             Charset candidate = CharsetICU.forNameICU(names[i]);
             if (!(candidate instanceof CharsetICU)) {
                 // forNameICU may hand back a charset from a non-ICU provider.
                 // Such a charset cannot report its Unicode set, so reject it
                 // with the documented exception instead of failing on a cast.
                 throw new UnsupportedCharsetException(names[i]);
             }
             UnicodeSet supported = new UnicodeSet(); // empty set
             ((CharsetICU) candidate).getUnicodeSet(supported, mappingTypes);

             int column = i / BITS_PER_COLUMN;
             int bit = 1 << (i % BITS_PER_COLUMN);
             // flag every range this charset reports
             int rangeCount = supported.getRangeCount();
             for (int j = 0; j < rangeCount; ++j) {
                 pvec.setValue(supported.getRangeStart(j),
                         supported.getRangeEnd(j), column, ~0, bit);
             }
         }

         // handle excluded code points:
         // simply set their values to all 1's in the pvec so that they never
         // clear a bit when intersected with the running mask
         int excludedRanges = excludedCodePoints.getRangeCount();
         for (int j = 0; j < excludedRanges; ++j) {
             int startChar = excludedCodePoints.getRangeStart(j);
             int endChar = excludedCodePoints.getRangeEnd(j);
             for (int col = 0; col < columnCount; ++col) {
                 pvec.setValue(startChar, endChar, col, ~0, ~0);
             }
         }
         return pvec;
     }

     // internal function to intersect two sets of masks
     // returns whether the mask has reduced to all zeros. The
     // second set of mask consists of len elements in pv starting from
     // pvIndex. dest is updated in place.
     private boolean intersectMasks(int[] dest, int pvIndex, int len) {
         int oredDest = 0;
         for (int i = 0; i < len; ++i) {
             dest[i] &= pv[pvIndex + i];
             oredDest |= dest[i];
         }
         return oredDest == 0;
     }

     // internal function: translates a bit mask into the names of the
     // encodings whose bit is set, in the order the encodings were supplied.
     // Bits beyond encodings.length in the last column are ignored. The result
     // is never null; it is empty when no bit is set.
     private List<String> selectForMask(int[] mask) {
         List<String> result = new ArrayList<String>();
         for (int k = 0; k < encodings.length; ++k) {
             int bit = 1 << (k % BITS_PER_COLUMN);
             if ((mask[k / BITS_PER_COLUMN] & bit) != 0) {
                 result.add(encodings[k]);
             }
         }
         return result;
     }

     /**
      * Construct a CharsetSelector from a list of charset names.
      *
      * @param charsetList
      *            a list of charset names in the form of strings. If charsetList
      *            is empty, a selector for all available charset is constructed.
      * @param excludedCodePoints
      *            a set of code points to be excluded from consideration.
      *            Excluded code points appearing in the input CharSequence do
      *            not change the selection result. It could be empty when no
      *            code point should be excluded.
      * @param mappingTypes
      *            an int which determines whether to consider only roundtrip
      *            mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See
      *            CharsetICU.java for the constants that are currently
      *            supported.
      * @throws IllegalArgumentException
      *             if mappingTypes is not a supported constant, or if
      *             charsetList or excludedCodePoints is null.
      * @throws IllegalCharsetNameException
      *             If the given charset name is illegal.
      * @throws UnsupportedCharsetException
      *             If no support for the named charset is available in this
      *             instance of the Java virtual machine, or the named charset
      *             is not an ICU charset.
      * @stable ICU 4.2
      */
     public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints,
             int mappingTypes) {
         if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET
                 && mappingTypes != CharsetICU.ROUNDTRIP_SET) {
             throw new IllegalArgumentException("Unsupported mappingTypes");
         }
         if (charsetList == null) {
             throw new IllegalArgumentException("charsetList must not be null");
         }
         if (excludedCodePoints == null) {
             throw new IllegalArgumentException(
                     "excludedCodePoints must not be null");
         }

         // An empty list means "every charset the ICU provider knows about".
         encodings = charsetList.isEmpty()
                 ? CharsetProviderICU.getAvailableNames()
                 : charsetList.toArray(new String[0]);
         columns = (encodings.length + BITS_PER_COLUMN - 1) / BITS_PER_COLUMN;

         PropsVectors pvec = buildSelectorData(encodings, columns,
                 excludedCodePoints, mappingTypes);
         // Build the trie first, then read the compacted array.
         // NOTE(review): presumably getCompactedArray() relies on the prior
         // compaction - keep this order unless PropsVectors says otherwise.
         trie = pvec.compactToTrieWithRowIndexes();
         pv = pvec.getCompactedArray();
     }

     /**
      * Select charsets that can map all characters in a CharSequence, ignoring
      * the excluded code points.
      *
      * @param unicodeText
      *            a CharSequence. It could be empty, in which case every
      *            charset of the selector is returned.
      * @return a list that contains charset names in the form of strings. The
      *         returned encoding names and their order will be the same as
      *         supplied when building the selector. The list is empty when no
      *         charset qualifies.
      *
      * @stable ICU 4.2
      */
     public List<String> selectForString(CharSequence unicodeText) {
         int[] mask = new int[columns];
         for (int i = 0; i < columns; i++) {
             mask[i] = ~0; // set each bit to 1
                           // Note: All integers are signed in Java, assigning
                           // 2 ^ 32 -1 to mask is wrong!
         }

         int index = 0;
         while (index < unicodeText.length()) {
             int c = UTF16.charAt(unicodeText, index);
             int pvIndex = trie.getCodePointValue(c);
             index += UTF16.getCharCount(c);
             // stop early once no charset is left
             if (intersectMasks(mask, pvIndex, columns)) {
                 break;
             }
         }
         return selectForMask(mask);
     }
 }
	/*
	******************************************************************************
	* Copyright (C) 1996-2010, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	******************************************************************************
	*/

	/*
	* This is a port of the C++ class UConverterSelector.
	*
	* Methods related to serialization are not ported in this version. In addition,
	* the selectForUTF8 method is not going to be ported, as UTF8 is seldom used
	* in Java.
	*
	* @author Shaopeng Jia
	*/

	package com.ibm.icu.charset;

	import java.nio.charset.Charset;
	import java.nio.charset.IllegalCharsetNameException;
	import java.nio.charset.UnsupportedCharsetException;
	import java.util.ArrayList;
	import java.util.List;

	import com.ibm.icu.impl.IntTrie;
	import com.ibm.icu.impl.PropsVectors;
	import com.ibm.icu.text.UTF16;
	import com.ibm.icu.text.UnicodeSet;

	/**
	* Charset Selector
	*
	* A charset selector is built with a list of charset names and given an input
	* CharSequence returns the list of names the corresponding charsets which can
	* convert the CharSequence.
	*
	* @stable ICU 4.2
	*/
	public final class CharsetSelector {
	private IntTrie trie;
	private int[] pv; // table of bits
	private String[] encodings; // encodings users ask to use

	private void generateSelectorData(PropsVectors pvec,
	UnicodeSet excludedCodePoints, int mappingTypes) {
	int columns = (encodings.length + 31) / 32;

	// set errorValue to all-ones
	for (int col = 0; col < columns; ++col) {
	pvec.setValue(PropsVectors.ERROR_VALUE_CP,
	PropsVectors.ERROR_VALUE_CP, col, ~0, ~0);
	}

	for (int i = 0; i < encodings.length; ++i) {
	Charset testCharset = CharsetICU.forNameICU(encodings[i]);
	UnicodeSet unicodePointSet = new UnicodeSet(); // empty set
	((CharsetICU) testCharset).getUnicodeSet(unicodePointSet,
	mappingTypes);
	int column = i / 32;
	int mask = 1 << (i % 32);
	// now iterate over intervals on set i
	int itemCount = unicodePointSet.getRangeCount();
	for (int j = 0; j < itemCount; ++j) {
	int startChar = unicodePointSet.getRangeStart(j);
	int endChar = unicodePointSet.getRangeEnd(j);
	pvec.setValue(startChar, endChar, column, ~0, mask);
	}
	}

	// handle excluded encodings
	// Simply set their values to all 1's in the pvec
	if (!excludedCodePoints.isEmpty()) {
	int itemCount = excludedCodePoints.getRangeCount();
	for (int j = 0; j < itemCount; ++j) {
	int startChar = excludedCodePoints.getRangeStart(j);
	int endChar = excludedCodePoints.getRangeEnd(j);
	for (int col = 0; col < columns; col++) {
	pvec.setValue(startChar, endChar, col, ~0, ~0);
	}
	}
	}

	trie = pvec.compactToTrieWithRowIndexes();
	pv = pvec.getCompactedArray();
	}

	// internal function to intersect two sets of masks
	// returns whether the mask has reduced to all zeros. The
	// second set of mask consists of len elements in pv starting from
	// pvIndex
	private boolean intersectMasks(int[] dest, int pvIndex, int len) {
	int oredDest = 0;
	for (int i = 0; i < len; ++i) {
	oredDest \|= (dest[i] &= pv[pvIndex + i]);
	}
	return oredDest == 0;
	}

	// internal function
	private List<String> selectForMask(int[] mask) {
	// this is the context we will use. Store a table of indices to which
	// encodings are legit

	List<String> result = new ArrayList<String>();
	int columns = (encodings.length + 31) / 32;
	int numOnes = countOnes(mask, columns);

	// now we know the exact space we need to index
	if (numOnes > 0) {
	int k = 0;
	for (int j = 0; j < columns; j++) {
	int v = mask[j];
	for (int i = 0; i < 32 && k < encodings.length; i++, k++) {
	if ((v & 1) != 0) {
	result.add(encodings[k]);
	}
	v >>= 1;
	}
	}
	}

	// otherwise, index will remain NULL
	return result;
	}

	// internal function to count how many 1's are there in a mask
	// algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html
	private int countOnes(int[] mask, int len) {
	int totalOnes = 0;
	for (int i = 0; i < len; ++i) {
	int ent = mask[i];
	for (; ent != 0; totalOnes++) {
	ent &= ent - 1; // clear the least significant bit set
	}
	}
	return totalOnes;
	}

	/**
	* Construct a CharsetSelector from a list of charset names.
	*
	* @param charsetList
	* a list of charset names in the form of strings. If charsetList
	* is empty, a selector for all available charset is constructed.
	* @param excludedCodePoints
	* a set of code points to be excluded from consideration.
	* Excluded code points appearing in the input CharSequence do
	* not change the selection result. It could be empty when no
	* code point should be excluded.
	* @param mappingTypes
	* an int which determines whether to consider only roundtrip
	* mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See
	* CharsetICU.java for the constants that are currently
	* supported.
	* @throws IllegalArgumentException
	* if the parameters is invalid.
	* @throws IllegalCharsetNameException
	* If the given charset name is illegal.
	* @throws UnsupportedCharsetException
	* If no support for the named charset is available in this
	* instance of the Java virtual machine.
	* @stable ICU 4.2
	*/
	public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints,
	int mappingTypes) {
	if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET
	&& mappingTypes != CharsetICU.ROUNDTRIP_SET) {
	throw new IllegalArgumentException("Unsupported mappingTypes");
	}

	int encodingCount = charsetList.size();
	if (encodingCount > 0) {
	encodings = charsetList.toArray(new String[0]);
	} else {
	encodings = CharsetProviderICU.getAvailableNames();
	encodingCount = encodings.length;
	}

	PropsVectors pvec = new PropsVectors((encodingCount + 31) / 32);
	generateSelectorData(pvec, excludedCodePoints, mappingTypes);
	}

	/**
	* Select charsets that can map all characters in a CharSequence, ignoring
	* the excluded code points.
	*
	* @param unicodeText
	* a CharSequence. It could be empty.
	* @return a list that contains charset names in the form of strings. The
	* returned encoding names and their order will be the same as
	* supplied when building the selector.
	*
	* @stable ICU 4.2
	*/
	public List<String> selectForString(CharSequence unicodeText) {
	int columns = (encodings.length + 31) / 32;
	int[] mask = new int[columns];
	for (int i = 0; i < columns; i++) {
	mask[i] = - 1; // set each bit to 1
	// Note: All integers are signed in Java, assigning
	// 2 ^ 32 -1 to mask is wrong!
	}
	int index = 0;
	while (index < unicodeText.length()) {
	int c = UTF16.charAt(unicodeText, index);
	int pvIndex = trie.getCodePointValue(c);
	index += UTF16.getCharCount(c);
	if (intersectMasks(mask, pvIndex, columns)) {
	break;
	}
	}
	return selectForMask(mask);
	}
	}