main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java - external/github.com/unicode-org/icu - Git at Google

 /*
  *******************************************************************************
  * Copyright (C) 2008-2011, Google Inc, International Business Machines Corporation
  * and others. All Rights Reserved.
  *******************************************************************************
  */
 package com.ibm.icu.text;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;

 import com.ibm.icu.impl.MultiComparator;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.AlphabeticIndex.Bucket;
 import com.ibm.icu.util.LocaleData;
 import com.ibm.icu.util.ULocale;

 /**
  * AlphabeticIndex supports the creation of a UI index appropriate for a given language. It can support either direct
  * use, or use with a client that doesn't support localized collation. The following is an example of what an index
  * might look like in a UI:
  *
  * <pre>
  *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  ...</b>
  *
  *  <b>A</b>
  *     Addison
  *     Albertson
  *     Azensky
  *  <b>B</b>
  *     Baecker
  *  ...
  * </pre>
  *
  * The class can generate a list of labels for use as a UI "index", that is, a list of clickable characters (or
  * character sequences) that allow the user to see a segment (bucket) of a larger "target" list. That is, each label
  * corresponds to a bucket in the target list, where everything in the bucket is greater than or equal to the character
  * (according to the locale's collation). Strings can be added to the index; they will be in sorted order in the right
  * bucket.</p>
  * <p>
  * The class also supports having buckets for strings before the first (underflow), after the last (overflow), and
  * between scripts (inflow). For example, if the index is constructed with labels for Russian and English, Greek
  * characters would fall into an inflow bucket between the other two scripts.</p>
  *
  * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters as well as characters from the user's language, then it is a good idea to call addLabels(ULocale.English).</p>
  *
  * <h2>Direct Use</h2>
  * <p>The following shows an example of building an index directly.
  *  The "show..." methods below are just to illustrate usage.
  *
  * <pre>
  * // Create a simple index where the values for the strings are Integers, and add the strings
  *
  * AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale);
  * int counter = 0;
  * for (String item : test) {
  *     index.addRecord(item, counter++);
  * }
  * ...
  * // Show index at top. We could skip or gray out empty buckets
  *
  * for (AlphabeticIndex.Bucket<Integer> bucket : index) {
  *     if (showAll || bucket.size() != 0) {
  *         showLabelAtTop(UI, bucket.getLabel());
  *     }
  * }
  *  ...
  * // Show the buckets with their contents, skipping empty buckets
  *
  * for (AlphabeticIndex.Bucket<Integer> bucket : index) {
  *     if (bucket.size() != 0) {
  *         showLabelInList(UI, bucket.getLabel());
  *         for (AlphabeticIndex.Record<Integer> item : bucket) {
  *             showIndexedItem(UI, item.getName(), item.getData());
  *         }
  * </pre>
  *
  * The caller can build different UIs using this class. For example, an index character could be omitted or grayed-out
  * if its bucket is empty. Small buckets could also be combined based on size, such as:
  *
  * <pre>
  * <b>... A-F G-N O-Z ...</b>
  * </pre>
  *
  * <h2>Client Support</h2>
  * <p>
  * Callers can also use the AlphabeticIndex to support sorting on a client that doesn't support collation.
  * <ul>
  * <li>getLabels() can be used to get a list of the labels, such as "...", "A", "B",..., and send that list to the client.
  * </li>
  * <li>When the client has a new name, it sends that name to the server. The server needs to call the following methods,
  * and communicate the bucketIndex and collationKey back to the client.
  *
  * <pre>
  * int bucketIndex = alphabeticIndex.getBucketIndex(name);
  * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
  * </pre>
  *
  * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
  * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
  * </ul>
  *
  * <p>
  * <b>Notes:</b>
  * <ul>
  * <li>Additional collation parameters can be passed in as part of the locale name. For example, German plus numeric
  * sorting would be "de@kn-true".
  *
  * @author markdavis
  * @draft ICU 4.6
  * @provisional This API might change or be removed in a future release.
  */
 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {

     /**
      * Internals
      */
     static final boolean HACK_CODED_FIRSTS = true;

     private static UnicodeSet UNIHAN = new UnicodeSet("[:script=Hani:]");

     private static final char CGJ = '\u034F';
     private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
     private static final UnicodeSet HANGUL = new UnicodeSet(
     "[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14  \uC0AC  \uC544 \uC790  \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
     private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
     private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");

     private final RuleBasedCollator collatorOriginal;
     private final RuleBasedCollator collatorPrimaryOnly;
     private RuleBasedCollator collatorExternal;

     // for testing
     private final LinkedHashMap<String, Set<String>> alreadyIn = new LinkedHashMap<String, Set<String>>();
     private final List<String> noDistinctSorting = new ArrayList<String>();
     private final List<String> notAlphabetic = new ArrayList<String>();

     // We accumulate these as we build up the input parameters

     private final UnicodeSet initialLabels = new UnicodeSet();
     private final Collection<Record<V>> inputList = new ArrayList<Record<V>>();

     // Lazy evaluated: null means that we have not built yet.

     private BucketList buckets;

     private String overflowLabel = "\u2026";
     private String underflowLabel = "\u2026";
     private String inflowLabel = "\u2026";
     private LangType langType;

     /**
      * Create the index object.
      *
      * @param locale
      *            The locale for the index.
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex(ULocale locale) {
         this(locale, null, getIndexExemplars(locale));
     }

     /**
      * Create the index object.
      *
      * @param locale
      *            The locale for the index.
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex(Locale locale) {
         this(ULocale.forLocale(locale));
     }

     /**
      * @internal
      * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
      */
     public enum LangType {
         /**
          * @internal
          * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
          */
         NORMAL,
         /**
          * @internal
          * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
          */
         SIMPLIFIED,
         /**
          * @internal
          * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
          */
         TRADITIONAL;
         /**
          * @internal
          * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
          */
         public static LangType fromLocale(ULocale locale) {
             String lang = locale.getLanguage();
             if (lang.equals("zh")) {
                 if ("Hant".equals(locale.getScript()) || "TW".equals(locale.getCountry())) {
                     return TRADITIONAL;
                 }
                 return SIMPLIFIED;
             }
             return NORMAL;
         }
     }

     /**
      * @internal
      * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
      */
     public AlphabeticIndex(ULocale locale, RuleBasedCollator collator, UnicodeSet exemplarChars) {
         langType = LangType.fromLocale(locale);
         // HACK because we have to know the type of the collation for Chinese
         if (langType != LangType.NORMAL) {
             locale = locale.setKeywordValue("collation", langType == LangType.TRADITIONAL ? "stroke" : "pinyin");
         }
         collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
         try {
             collatorPrimaryOnly = (RuleBasedCollator) (collatorOriginal.clone());
         } catch (Exception e) {
             // should never happen
             throw new IllegalStateException("Collator cannot be cloned", e);
         }
         collatorPrimaryOnly.setStrength(Collator.PRIMARY);
         addLabels(exemplarChars);
     }

     /**
      * Add more index characters (aside from what are in the locale)
      * @param additions additional characters to add to the index, such as A-Z.
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
         initialLabels.addAll(additions);
         buckets = null;
         return this;
     }

     /**
      * Add more index characters (aside from what are in the locale)
      * @param additions additional characters to add to the index, such as those in Swedish.
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> addLabels(ULocale... additions) {
         for (ULocale addition : additions) {
             initialLabels.addAll(getIndexExemplars(addition));
         }
         buckets = null;
         return this;
     }

     /**
      * Add more index characters (aside from what are in the locale)
      * @param additions additional characters to add to the index, such as those in Swedish.
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> addLabels(Locale... additions) {
         for (Locale addition : additions) {
             initialLabels.addAll(getIndexExemplars(ULocale.forLocale(addition)));
         }
         buckets = null;
         return this;
     }

     /**
      * Set the overflow label
      * @param overflowLabel see class description
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
         this.overflowLabel = overflowLabel;
         return this;
     }

     /**
      * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
      *
      * @return underflow label
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public String getUnderflowLabel() {
         return underflowLabel; // TODO get localized version
     }


     /**
      * Set the underflowLabel label
      * @param underflowLabel see class description
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
         this.underflowLabel = underflowLabel;
         return this;
     }

     /**
      * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
      *
      * @return overflow label
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public String getOverflowLabel() {
         return overflowLabel; // TODO get localized version
     }


     /**
      * Set the inflowLabel label
      * @param inflowLabel see class description
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
         this.inflowLabel = inflowLabel;
         return this;
     }

     /**
      * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
      * for Latin and Greek are used: X Y Z ... &#x0391; &#x0392; &#x0393;.
      *
      * @return inflow label
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public String getInflowLabel() {
         return inflowLabel; // TODO get localized version
     }


     /**
      * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
      *
      * @return maxLabelCount maximum number of labels.
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public int getMaxLabelCount() {
         return maxLabelCount;
     }

     /**
      * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
      * getBucketCount().
      *
      * @return maxLabelCount label Set the maximum number of labels. Currently, if the number is exceeded, then every
      *         nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
      *         future.
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
         this.maxLabelCount = maxLabelCount;
         return this;
     }

     /**
      * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
      * and sort differently, and that the overall list is small enough.
      * @return
      */
     private ArrayList<String> initLabels() {
         UnicodeSet exemplars = new UnicodeSet(initialLabels);

         // First sort them, with an "best" ordering among items that are the same according
         // to the collator.
         // Re the warning: the JDK inexplicably didn't make Collators be Comparator<String>!
         @SuppressWarnings("unchecked")
         Set<String> preferenceSorting = new TreeSet<String>(new MultiComparator<Object>(collatorPrimaryOnly, PREFERENCE_COMPARATOR));
         exemplars.addAllTo(preferenceSorting);

         TreeSet<String> indexCharacterSet = new TreeSet<String>(collatorPrimaryOnly);

         // We nw make a sorted array of elements
         // Some of the input may, however, be redundant.
         // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
         // So we make a pass through, filtering out those cases.

         for (String item : preferenceSorting) {
             if (indexCharacterSet.contains(item)) {
                 for (String itemAlreadyIn : indexCharacterSet) {
                     if (collatorPrimaryOnly.compare(item, itemAlreadyIn) == 0) {
                         Set<String> targets = alreadyIn.get(itemAlreadyIn);
                         if (targets == null) {
                             alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
                         }
                         targets.add(item);
                         break;
                     }
                 }
             } else if (UTF16.countCodePoint(item) > 1 && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
                 noDistinctSorting.add(item);
             } else if (!ALPHABETIC.containsSome(item)) {
                 notAlphabetic.add(item);
             } else {
                 indexCharacterSet.add(item);
             }
         }

         // if the result is still too large, cut down to maxCount elements, by removing every nth element

         final int size = indexCharacterSet.size() - 1;
         if (size > maxLabelCount) {
             int count = 0;
             int old = -1;
             for (Iterator<String> it = indexCharacterSet.iterator(); it.hasNext();) {
                 ++count;
                 it.next();
                 final int bump = count * maxLabelCount / size;
                 if (bump == old) {
                     it.remove();
                 } else {
                     old = bump;
                 }
             }
         }

         return new ArrayList<String>(indexCharacterSet);
     }

     /**
      * This method is called to get the index exemplars. Normally these come from the locale directly,
      * but if they aren't available, we have to synthesize them.
      * @param locale
      * @return
      */
     private static UnicodeSet getIndexExemplars(ULocale locale) {
         UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);

         if (exemplars != null) {
             return exemplars;
         }

         // Synthesize the index exemplars

         exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);

         // get the exemplars, and handle special cases

         exemplars = exemplars.cloneAsThawed();
         // question: should we add auxiliary exemplars?
         if (exemplars.containsSome(CORE_LATIN) || exemplars.size() == 0) {
             exemplars.addAll(CORE_LATIN);
         }
         if (exemplars.containsSome(HANGUL)) {
             // cut down to small list
             exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
         }
         if (exemplars.containsSome(ETHIOPIC)) {
             // cut down to small list
             // make use of the fact that Ethiopic is allocated in 8's, where
             // the base is 0 mod 8.
             for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
                 if ((it.codepoint & 0x7) != 0) {
                     exemplars.remove(it.codepoint);
                 }
             }
         }

         UnicodeSet uppercased = new UnicodeSet();
         for (String item : exemplars) {
             uppercased.add(UCharacter.toUpperCase(locale, item));
         }

         return uppercased;
     }

     /**
      * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
      * <p>This is used to test whether contractions sort differently from their components.
      */
     private String separated(String item) {
         StringBuilder result = new StringBuilder();
         // add a CGJ except within surrogates
         char last = item.charAt(0);
         result.append(last);
         for (int i = 1; i < item.length(); ++i) {
             char ch = item.charAt(i);
             if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
                 result.append(CGJ);
             }
             result.append(ch);
             last = ch;
         }
         return result.toString();
     }

     /**
      * Get the labels.
      *
      * @return A collection listing the labels, after processing.
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public List<String> getBucketLabels() {
         if (buckets == null) {
             initBuckets();
         }
         ArrayList<String> result = new ArrayList<String>();
         for (Bucket<V> bucket : buckets) {
             result.add(bucket.getLabel());
         }
         return result;
     }

     /**
      * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
      * then stored. The next time it is accessed, the same instance is returned.
      * <p>
      * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
      * synchronizing.</i></b>
      *
      * @return a clone of the collator used internally
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public RuleBasedCollator getCollator() {
         if (collatorExternal == null) {
             try {
                 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
             } catch (Exception e) {
                 // should never happen
                 throw new IllegalStateException("Collator cannot be cloned", e);
             }
         }
         return collatorExternal;
     }

     /**
      * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
      * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
      * the first added comes first.
      *
      * @param name
      *            Name, such as a name
      * @param data
      *            Data, such as an address or link
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
         // TODO instead of invalidating, just add to unprocessed list.
         buckets = null; // invalidate old bucketlist
         inputList.add(new Record<V>(name, data, inputList.size()));
         return this;
     }

     /**
      * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
      * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
      * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
      * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
      * the index or collator.
      * <p>
      * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
      * those are changed, then the bucket number and sort key must be regenerated.
      *
      * @param name
      *            Name, such as a name
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public int getBucketIndex(CharSequence name) {
         if (buckets == null) {
             initBuckets();
         }
         if (langType == LangType.SIMPLIFIED) {
             String hackPrefix = hackName(name, collatorPrimaryOnly);
             if (hackPrefix != null) {
                 name = hackPrefix + name;
             }
         }
         return rawGetBucketIndex(name);
     }

     private int rawGetBucketIndex(CharSequence name) {
         // TODO use a binary search
         int result = -1;
         for (Bucket<V> bucket : buckets) {
             if (bucket.lowerBoundary == null) { // last bucket
                 return result;
             }
             int comp = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
             if (comp < 0) { // the first boundary is always "", and so -1 will never be returned
                 return result;
             } else if (comp == 0) {
                 return result + 1;
             }
             result++;
         }
         return result;
     }

     /**
      * Clear the index.
      *
      * @return this, for chaining
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public AlphabeticIndex<V> clearRecords() {
         buckets = null;
         inputList.clear();
         return this;
     }

     /**
      * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
      *
      * @return number of buckets
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public int getBucketCount() {
         if (buckets == null) {
             initBuckets();
         }
         return buckets.bucketList.size();
     }

     /**
      * Return the number of records in the index: that is, the total number of distinct <name,data> pairs added with addRecord(...), over all the buckets.
      *
      * @return total number of records in buckets
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public int getRecordCount() {
         return inputList.size();
     }

     /**
      * Return an iterator over the buckets.
      *
      * @return iterator over buckets.
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public Iterator<Bucket<V>> iterator() {
         if (buckets == null) {
             initBuckets();
         }
         return buckets.iterator();
     }

     /**
      * Convenience routine to bucket a list of input strings according to the index.<br>
      * Warning: if a UI suppresses buckets that are empty, this may result in the special buckets (underflow, overflow,
      * inflow) being adjacent. In that case, the application may want to combine them.
      *
      * @param inputList
      *            List of strings to be sorted and bucketed according to the labels.
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     private void initBuckets() {
         buckets = new BucketList();

         // Make a collator for records. Do this so that the Records can be static classes, and not know about the collators.
         // TODO make this a member of the class.
         Comparator<Record<V>> fullComparator = new Comparator<Record<V>>() {
             public int compare(Record<V> o1, Record<V> o2) {
                 int result = collatorOriginal.compare(o1.name, o2.name);
                 if (result != 0) {
                     return result;
                 }
                 return o1.counter - o2.counter;
             }
         };

         // If we have Pinyin, then we have a special hack to bucket items with ASCII.
         if (langType == LangType.SIMPLIFIED) {
             Map<String,Bucket<V>> rebucketMap = new HashMap<String, Bucket<V>>();
             for (Record<V> name : inputList) {
                 String key = hackName(name.name, collatorOriginal);
                 if (key == null) continue;
                 Bucket<V> bucket = rebucketMap.get(key);
                 if (bucket == null) {
                     int index = rawGetBucketIndex(key);
                     bucket = buckets.bucketList.get(index);
                 }
                 rebucketMap.put(key, bucket);
                 name.rebucket = bucket;
             }
         }

         // Set up a sorted list of the input
         TreeSet<Record<V>> sortedInput = new TreeSet<Record<V>>(fullComparator);
         sortedInput.addAll(inputList);

         // Now, we traverse all of the input, which is now sorted.
         // If the item doesn't go in the current bucket, we find the next bucket that contains it.
         // This makes the process order n*log(n), since we just sort the list and then do a linear process.
         // However, if the user adds item at a time and then gets the buckets, this isn't efficient, so
         // we need to improve it for that case.

         Iterator<Bucket<V>> bucketIterator = buckets.iterator();
         Bucket<V> currentBucket = bucketIterator.next();
         Bucket<V> nextBucket = bucketIterator.next();
         String upperBoundary = nextBucket.lowerBoundary; // there is always at least one bucket, so this is safe
         boolean atEnd = false;
         for (Record<V> s : sortedInput) {
             // special hack for pinyin
             if (s.rebucket != null) {
                 s.rebucket.records.add(s);
                 continue;
             }
             // if the current bucket isn't the right one, find the one that is
             // We have a special flag for the last bucket so that we don't look any further
             while (!atEnd && collatorPrimaryOnly.compare(s.name, upperBoundary) >= 0) {
                 currentBucket = nextBucket;
                 // now reset the boundary that we compare against
                 if (bucketIterator.hasNext()) {
                     nextBucket = bucketIterator.next();
                     upperBoundary = nextBucket.lowerBoundary;
                     if (upperBoundary == null) {
                         atEnd = true;
                     }
                 } else {
                     atEnd = true;
                 }
             }
             // now put the record into the bucket.
             currentBucket.records.add(s);
         }
     }

     /**
      * Get the Unicode character (or tailored string) that defines an overflow bucket; that is anything greater than or
      * equal to that string should go in that bucket, instead of with the last character. Normally that is the first
      * character of the script after lowerLimit. Thus in X Y Z ... <i>Devanagari-ka</i>, the overflow character for Z
      * would be the <i>Greek-alpha</i>.
      *
      * @param lowerLimit
      *            The character below the overflow (or inflow) bucket
      * @return string that defines top of the overflow buck for lowerLimit, or null if there is none
      * @internal
      * @deprecated This API is ICU internal only.
      */
     public String getOverflowComparisonString(String lowerLimit) {
         // TODO Use collator method instead of this hack
         for (String s : HACK_FIRST_CHARS_IN_SCRIPTS) {
             if (collatorPrimaryOnly.compare(s, lowerLimit) > 0) {
                 return s;
             }
         }
         return null;
     }

     /**
      * Return a list of the first character in each script, in collation order. Only exposed for testing.
      *
      * @return list of first characters in each script
      * @internal
      * @deprecated This API is ICU internal only.
      */
     public List<String> getFirstScriptCharacters() {
         return HACK_FIRST_CHARS_IN_SCRIPTS;
     }

     /**
      * As the index is built, strings may be discarded from the exemplars. This contains some of the discards, and is
      * intended for debugging.
      *
      * @internal
      * @deprecated This API is ICU internal only.
      */
     public Map<String, Set<String>> getAlreadyIn() {
         return alreadyIn;
     }

     /**
      * As the index is built, strings may be discarded from the exemplars. This contains some of the discards, and is
      * intended for debugging.
      *
      * @internal
      * @deprecated This API is ICU internal only.
      */
     public List<String> getNoDistinctSorting() {
         return noDistinctSorting;
     }

     /**
      * As the index is built, strings may be discarded from the exemplars. This contains some of the discards, and is
      * intended for debugging.
      *
      * @internal
      * @deprecated This API is ICU internal only.
      */
     public List<String> getNotAlphabetic() {
         return notAlphabetic;
     }

     private static UnicodeSet getScriptSet(String codePoint) {
         return new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, UScript.getScript(codePoint.codePointAt(0)));
     }

     private static final UnicodeSet IGNORE_SCRIPTS = new UnicodeSet(
     "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]").freeze();

     private static final PreferenceComparator PREFERENCE_COMPARATOR = new PreferenceComparator();
     private int maxLabelCount = 99;

     /**
      * Comparator that returns "better" strings first, where shorter NFKD is better, and otherwise NFKD binary order is
      * better, and otherwise binary order is better.
      */
     private static class PreferenceComparator implements Comparator<Object> {
         static final Comparator<String> binary = new UTF16.StringComparator(true, false, 0);

         public int compare(Object o1, Object o2) {
             return compare((String) o1, (String) o2);
         }

         public int compare(String s1, String s2) {
             if (s1 == s2) {
                 return 0;
             }
             String n1 = Normalizer.decompose(s1, true);
             String n2 = Normalizer.decompose(s2, true);
             int result = n1.length() - n2.length();
             if (result != 0) {
                 return result;
             }
             result = binary.compare(n1, n2);
             if (result != 0) {
                 return result;
             }
             return binary.compare(s1, s2);
         }
     }

     /**
      * A record to be sorted into buckets with getIndexBucketCharacters.
      *
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public static class Record<V> {
         private Bucket<V> rebucket = null; // special hack for Pinyin
         private CharSequence name;
         private V data;
         private int counter;

         private Record(CharSequence name, V data, int counter) {
             this.name = name;
             this.data = data;
             this.counter = counter;
         }

         /**
          * Get the name
          *
          * @return the name
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         public CharSequence getName() {
             return name;
         }

         /**
          * Get the data
          *
          * @return the data
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         public V getData() {
             return data;
         }

         /**
          * Standard toString()
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         public String toString() {
             return name + "=" + data + (rebucket == null ? "" : "{" + rebucket.label + "}");
         }
     }

     /**
      * A "bucket", containing records sorted under an index string by getIndexBucketCharacters. Is created by the
      * addBucket method in BucketList. A typical implementation will provide methods getLabel(), getSpecial(), and
      * getValues().<br>
      * See com.ibm.icu.dev.test.collator.IndexCharactersTest for an example.
      *
      * @param <V>
      *            Data type
      * @draft ICU 4.6
      * @provisional This API might change or be removed in a future release.
      */
     public static class Bucket<V> implements Iterable<Record<V>> {
         private final String label;
         private final String lowerBoundary;
         private final LabelType labelType;
         private final List<Record<V>> records = new ArrayList<Record<V>>();

         /**
          * Type of the label
          *
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         public enum LabelType {
             NORMAL, UNDERFLOW, INFLOW, OVERFLOW
         }

         /**
          * Set up the bucket.
          *
          * @param label
          *            label for the bucket
          * @param labelType
          *            is an underflow, overflow, or inflow bucket
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         private Bucket(String label, String lowerBoundary, LabelType labelType) {
             this.label = label;
             this.lowerBoundary = lowerBoundary;
             this.labelType = labelType;
         }

         /**
          * Get the label
          *
          * @return label for the bucket
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         public String getLabel() {
             return label;
         }

         /**
          * Is a normal, underflow, overflow, or inflow bucket
          *
          * @return is an underflow, overflow, or inflow bucket
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         public LabelType getLabelType() {
             return labelType;
         }

         /**
          * Get the number of records in the bucket.
          *
          * @return number of records in bucket
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         public int size() {
             return records.size();
         }

         /**
          * Iterator over the records in the bucket
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         public Iterator<Record<V>> iterator() {
             return records.iterator();
         }

         /**
          * Standard toString()
          * @draft ICU 4.6
          * @provisional This API might change or be removed in a future release.
          */
         @Override
         public String toString() {
             return "{" +
             "labelType=" + labelType
             + ", " +
             "lowerBoundary=" + lowerBoundary
             + ", " +
             "label=" + label
             + "}"
             ;
         }
     }

     private class BucketList implements Iterable<Bucket<V>> {
         private ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();

         BucketList() {
             // initialize indexCharacters;
             List<String> indexCharacters = initLabels();

             // underflow bucket
             bucketList.add(new Bucket<V>(getUnderflowLabel(), "", Bucket.LabelType.UNDERFLOW));

             // fix up the list, adding underflow, additions, overflow
             // insert infix labels as needed, using \uFFFF.
             String last = indexCharacters.get(0);
             bucketList.add(new Bucket<V>(last, last, Bucket.LabelType.NORMAL));
             UnicodeSet lastSet = getScriptSet(last).removeAll(IGNORE_SCRIPTS);

             for (int i = 1; i < indexCharacters.size(); ++i) {
                 String current = indexCharacters.get(i);
                 UnicodeSet set = getScriptSet(current).removeAll(IGNORE_SCRIPTS);
                 if (lastSet.containsNone(set)) {
                     // check for adjacent
                     String overflowComparisonString = getOverflowComparisonString(last);
                     if (collatorPrimaryOnly.compare(overflowComparisonString, current) < 0) {
                         bucketList.add(new Bucket<V>(getInflowLabel(), overflowComparisonString,
                                 Bucket.LabelType.INFLOW));
                         i++;
                         lastSet = set;
                     }
                 }
                 bucketList.add(new Bucket<V>(current, current, Bucket.LabelType.NORMAL));
                 last = current;
                 lastSet = set;
             }
             // overflow bucket
             String limitString = getOverflowComparisonString(last);
             bucketList.add(new Bucket<V>(getOverflowLabel(), limitString, Bucket.LabelType.OVERFLOW)); // final,

         }

         public Iterator<Bucket<V>> iterator() {
             return bucketList.iterator();
         }
     }

     /*
      * HACKS
      */

     /**
      * Only gets called for simplified Chinese. Uses further hack to distinguish long from short pinyin table.
      */
     private String hackName(CharSequence name, RuleBasedCollator comparator) {
         if (!UNIHAN.contains(Character.codePointAt(name, 0))) {
             return null;
         }
         synchronized (PINYIN_LOWER_BOUNDS_LONG) {
             if (PINYIN_LOWER_BOUNDS == null) {
                 if (comparator.getTailoredSet().contains(probeCharInLong)) {
                     PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
                     HACK_PINYIN_LOOKUP = HACK_PINYIN_LOOKUP_LONG;
                 } else {
                     PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
                     HACK_PINYIN_LOOKUP = HACK_PINYIN_LOOKUP_SHORT;
                 }
             }
         }
         int index = Arrays.binarySearch(HACK_PINYIN_LOOKUP, name, comparator);
         if (index < 0) {
             index = -index - 2;
         }
         return PINYIN_LOWER_BOUNDS.substring(index, index + 1);
     }

     private static String PINYIN_LOWER_BOUNDS;

     private static String[] HACK_PINYIN_LOOKUP;


     /**
      * HACKS
      * Generated with org.unicode.draft.GenerateUnihanCollator.
      */

     private int probeCharInLong = 0x28EAD;

     private static String PINYIN_LOWER_BOUNDS_LONG = "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";

     private static String[] HACK_PINYIN_LOOKUP_LONG = {
         "", // A
         "\u516B", // b : \u516B [b\u0101]
         "\uD863\uDEAD", // c : \U00028EAD [c\u0101]
         "\uD844\uDE51", // d : \U00021251 [d\u0101]
         "\u59B8", // e : \u59B8 [\u0113]
         "\u53D1", // f : \u53D1 [f\u0101]
         "\uD844\uDE45", // g : \U00021245 [g\u0101]
         "\u54C8", // h : \u54C8 [h\u0101]
         "\u4E0C", // j : \u4E0C [j\u012B]
         "\u5494", // k : \u5494 [k\u0101]
         "\u3547", // l : \u3547 [l\u0101]
         "\u5452", // m : \u5452 [\u1E3F]
         "\u5514", // n : \u5514 [\u0144]
         "\u5594", // o : \u5594 [\u014D]
         "\uD84F\uDC7A", // p : \U00023C7A [p\u0101]
         "\u4E03", // q : \u4E03 [q\u012B]
         "\u513F", // r : \u513F [r]
         "\u4EE8", // s : \u4EE8 [s\u0101]
         "\u4ED6", // t : \u4ED6 [t\u0101]
         "\u7A75", // w : \u7A75 [w\u0101]
         "\u5915", // x : \u5915 [x\u012B]
         "\u4E2B", // y : \u4E2B [y\u0101]
         "\u5E00", // z : \u5E00 [z\u0101]
     };

     private static String PINYIN_LOWER_BOUNDS_SHORT = "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";

     private static String[] HACK_PINYIN_LOOKUP_SHORT = {
         "", // A
         "\u516B", // b : \u516B [b\u0101]
         "\u5693", // c : \u5693 [c\u0101]
         "\u5491", // d : \u5491 [d\u0101]
         "\u59B8", // e : \u59B8 [\u0113]
         "\u53D1", // f : \u53D1 [f\u0101]
         "\u65EE", // g : \u65EE [g\u0101]
         "\u54C8", // h : \u54C8 [h\u0101]
         "\u4E0C", // j : \u4E0C [j\u012B]
         "\u5494", // k : \u5494 [k\u0101]
         "\u3547", // l : \u3547 [l\u0101]
         "\u5452", // m : \u5452 [\u1E3F]
         "\u5514", // n : \u5514 [\u0144]
         "\u5594", // o : \u5594 [\u014D]
         "\u5991", // p : \u5991 [p\u0101]
         "\u4E03", // q : \u4E03 [q\u012B]
         "\u513F", // r : \u513F [r]
         "\u4EE8", // s : \u4EE8 [s\u0101]
         "\u4ED6", // t : \u4ED6 [t\u0101]
         "\u7A75", // w : \u7A75 [w\u0101]
         "\u5915", // x : \u5915 [x\u012B]
         "\u4E2B", // y : \u4E2B [y\u0101]
         "\u5E00", // z : \u5E00 [z\u0101]
     };

     /**
      * HACKS
      */
     private static final List<String> HACK_FIRST_CHARS_IN_SCRIPTS =
         Arrays.asList(new String[] {
                 "a", "\u03B1", "\u2C81", "\u0430", "\u2C30", "\u10D0", "\u0561", "\u05D0", "\uD802\uDD00", "\u0800", "\u0621",
                 "\u0710",  // Syriac
                 "\u0840",  // Mandaic
                 "\u0780", "\u07CA", "\u2D30", "\u1200", "\u0950", "\u0985", "\u0A74", "\u0AD0", "\u0B05", "\u0BD0",
                 "\u0C05", "\u0C85", "\u0D05", "\u0D85", "\uABC0", "\uA800", "\uA882", "\uD804\uDC83",
                 "\u1B83",  // Sundanese
                 "\uD804\uDC05",  // Brahmi (U+11005)
                 "\uD802\uDE00", "\u0E01", "\u0E81", "\uAA80", "\u0F40", "\u1C00", "\uA840", "\u1900", "\u1700", "\u1720", "\u1740", "\u1760",
                 "\u1A00",  // Buginese
                 "\u1BC0",  // Batak
                 "\uA930", "\uA90A", "\u1000", "\u1780", "\u1950", "\u1980", "\u1A20", "\uAA00", "\u1B05", "\uA984", "\u1880", "\u1C5A", "\u13A0", "\u1401", "\u1681", "\u16A0", "\uD803\uDC00", "\uA500", "\uA6A0", "\u1100",
                 "\u3041", "\u30A1", "\u3105", "\uA000", "\uA4F8", "\uD800\uDE80", "\uD800\uDEA0", "\uD802\uDD20", "\uD800\uDF00", "\uD800\uDF30", "\uD801\uDC28", "\uD801\uDC50", "\uD801\uDC80", "\uD800\uDC00", "\uD802\uDC00", "\uD802\uDE60", "\uD802\uDF00", "\uD802\uDC40",
                 "\uD802\uDF40", "\uD802\uDF60", "\uD800\uDF80", "\uD800\uDFA0", "\uD808\uDC00", "\uD80C\uDC00", "\u4E00"
         });

     /**
      * Only for testing...
      * @internal
      * @deprecated only for internal testing
      */
     public static List<String> getFirstCharactersInScripts() {
         return HACK_FIRST_CHARS_IN_SCRIPTS;
     }
 }