main/classes/core/src/com/ibm/icu/impl/LocaleIDParser.java - external/github.com/unicode-org/icu - Git at Google

 /*
 ******************************************************************************
 * Copyright (C) 2003-2009, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

 package com.ibm.icu.impl;

 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeMap;

 import com.ibm.icu.impl.locale.AsciiUtil;

 /**
  * Utility class to parse and normalize locale ids (including POSIX style).
  *
  * <p>An instance wraps the characters of a single locale id.  The query methods
  * (for example {@link #getLanguage()} or {@link #getName()}) re-scan those
  * characters and assemble normalized text in an internal buffer that grows on
  * demand.  Keywords are parsed lazily by {@link #getKeywordMap()} and cached; they
  * can afterwards be changed with {@link #setKeywordValue(String, String)} and
  * {@link #defaultKeywordValue(String, String)}.
  */
 public final class LocaleIDParser {
     /**
      * Shared comparator used for every keyword map; orders keys with
      * {@link String#compareTo(String)}.  It is stateless, so one instance suffices.
      */
     private static final Comparator<String> KEY_COMPARATOR = new Comparator<String>() {
         public int compare(String lhs, String rhs) {
             return lhs.compareTo(rhs);
         }
     };

     /** The characters of the locale id being parsed. */
     private final char[] id;
     /** Current read position in {@link #id}. */
     private int index;
     /** Output buffer for normalized text; enlarged by {@link #append(char)} when full. */
     private char[] buffer;
     /** Number of valid characters currently in {@link #buffer}. */
     private int blen;
     // POSIX-style '@' handling is opt-in: when this flag is true, '@' starts a keyword
     // list only if a '=' follows it somewhere (see setToKeywordStart()).
     private final boolean canonicalize;
     /** Set by parseCountry() when country text was kept; consulted by parseVariant(). */
     private boolean hadCountry;

     // used when canonicalizing
     /** Cached keyword map; null until {@link #getKeywordMap()} or a setter fills it in. */
     Map<String, String> keywords;
     /** Optional base name override; when non-null it replaces the parsed base name. */
     String baseName;

     /**
      * Parsing constants.
      */
     private static final char KEYWORD_SEPARATOR     = '@';
     private static final char HYPHEN                = '-';
     private static final char KEYWORD_ASSIGN        = '=';
     private static final char COMMA                 = ',';
     private static final char ITEM_SEPARATOR        = ';';
     private static final char DOT                   = '.';
     private static final char UNDERSCORE            = '_';

     /**
      * Creates a parser for the given id with canonicalization turned off.
      *
      * @param localeID the locale id to parse; must not be null
      */
     public LocaleIDParser(String localeID) {
         this(localeID, false);
     }

     /**
      * Creates a parser for the given id.
      *
      * @param localeID the locale id to parse; must not be null
      * @param canonicalize if true, an '@' is treated as the start of a keyword list
      *        only when a '=' occurs after it
      */
     public LocaleIDParser(String localeID, boolean canonicalize) {
         id = localeID.toCharArray();
         index = 0;
         // a little headroom for separators that normalization may insert
         buffer = new char[id.length + 5];
         blen = 0;
         this.canonicalize = canonicalize;
     }

     /**
      * Rewinds the read position to the start of the id and empties the buffer.
      */
     private void reset() {
         index = blen = 0;
     }

     // utilities for working on text in the buffer

     /**
      * Append c to the buffer, enlarging the buffer first when it is full.
      *
      * <p>Growth is checked explicitly rather than by catching an index exception, and
      * there is no upper bound on the size: output can legitimately be longer than the
      * original id (inserted separators, keywords added after construction).
      */
     private void append(char c) {
         if (blen >= buffer.length) {
             char[] nbuffer = new char[Math.max(buffer.length * 2, blen + 1)];
             System.arraycopy(buffer, 0, nbuffer, 0, buffer.length);
             buffer = nbuffer;
         }
         buffer[blen++] = c;
     }

     /**
      * Append the id separator (underscore) to the buffer.
      */
     private void addSeparator() {
         append(UNDERSCORE);
     }

     /**
      * Returns the text in the buffer from start to blen as a String.
      */
     private String getString(int start) {
         if (start == blen) {
             return "";
         }
         return new String(buffer, start, blen - start);
     }

     /**
      * Set the length of the buffer to pos, then append the string.  The position is
      * not validated; callers pass a value no larger than the current length.
      */
     private void set(int pos, String s) {
         this.blen = pos; // no safety
         append(s);
     }

     /**
      * Append the string to the buffer.
      */
     private void append(String s) {
         for (int i = 0; i < s.length(); ++i) {
             append(s.charAt(i));
         }
     }

     // utilities for parsing text out of the id

     /**
      * Character to indicate no more text is available in the id.
      */
     private static final char DONE = '\uffff';

     /**
      * Returns the character at index in the id, and advance index.  The returned character
      * is DONE if index was at the limit of the id.  The index is advanced regardless
      * so that decrementing the index will always 'unget' the last character returned.
      */
     private char next() {
         if (index == id.length) {
             index++;
             return DONE;
         }

         return id[index++];
     }

     /**
      * Advance index until the next terminator or id separator, and leave it there.
      */
     private void skipUntilTerminatorOrIDSeparator() {
         while (!isTerminatorOrIDSeparator(next())) {
             // just consume
         }
         --index; // unget the terminator / separator
     }

     /**
      * Returns true if index is at or past the end of the id, or the character at
      * index is a terminator.
      */
     private boolean atTerminator() {
         return index >= id.length || isTerminator(id[index]);
     }

     /**
      * Returns true if the character is a terminator (keyword separator, dot, or DONE).
      * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
      */
     private boolean isTerminator(char c) {
         // always terminate at DOT, even if not handling POSIX.  It's an error...
         return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
     }

     /**
      * Returns true if the character is a terminator or id separator (underscore or
      * hyphen).
      */
     private boolean isTerminatorOrIDSeparator(char c) {
         return c == KEYWORD_SEPARATOR || c == UNDERSCORE || c == HYPHEN ||
         c == DONE || c == DOT;
     }

     /**
      * Returns true if the id starts with an experimental or private language
      * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
      */
     private boolean haveExperimentalLanguagePrefix() {
         if (id.length > 2) {
             char c = id[1];
             if (c == HYPHEN || c == UNDERSCORE) {
                 c = id[0];
                 return c == 'x' || c == 'X' || c == 'i' || c == 'I';
             }
         }
         return false;
     }

     /**
      * Returns true if a value separator ('=') occurs at or after index.
      */
     private boolean haveKeywordAssign() {
         // assume it is safe to start from index
         for (int i = index; i < id.length; ++i) {
             if (id[i] == KEYWORD_ASSIGN) {
                 return true;
             }
         }
         return false;
     }

     /**
      * Advance index past language, and accumulate normalized language code in buffer.
      * Index must be at 0 when this is called.  Index is left at a terminator or id
      * separator.  Returns the start of the language code in the buffer.
      */
     private int parseLanguage() {
         if (haveExperimentalLanguagePrefix()) {
             // keep the prefix, normalized to lower case plus hyphen
             append(Character.toLowerCase(id[0]));
             append(HYPHEN);
             index = 2;
         }

         char c;
         while (!isTerminatorOrIDSeparator(c = next())) {
             append(Character.toLowerCase(c));
         }
         --index; // unget

         if (blen == 3) {
             // substitute the replacement code when LocaleIDs supplies one
             String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
             if (lang != null) {
                 set(0, lang);
             }
         }

         return 0;
     }

     /**
      * Advance index past language.  Index must be at 0 when this is called.  Index
      * is left at a terminator or id separator.
      */
     private void skipLanguage() {
         if (haveExperimentalLanguagePrefix()) {
             index = 2;
         }
         skipUntilTerminatorOrIDSeparator();
     }

     /**
      * Advance index past script, and accumulate normalized script in buffer.
      * Index must be immediately after the language.
      * If the item at this position is not a script (is not four characters
      * long) leave index and buffer unchanged.  Otherwise index is left at
      * a terminator or id separator.  Returns the start of the script code
      * in the buffer (this may be equal to the buffer length, if there is no
      * script).
      */
     private int parseScript() {
         if (!atTerminator()) {
             int oldIndex = index; // save original index
             ++index;

             int oldBlen = blen; // taken before the separator is appended, so truncating undoes everything
             char c;
             while (!isTerminatorOrIDSeparator(c = next())) {
                 if (blen == oldBlen) { // first pass: separator, then title-case first letter
                     addSeparator();
                     append(Character.toUpperCase(c));
                 } else {
                     append(Character.toLowerCase(c));
                 }
             }
             --index; // unget

             /* If it's not exactly 4 characters long, then it's not a script. */
             if (index - oldIndex != 5) { // +1 to account for separator
                 index = oldIndex;
                 blen = oldBlen;
             } else {
                 oldBlen++; // step past the separator, for clients who want to extract just the script
             }

             return oldBlen;
         }
         return blen;
     }

     /**
      * Advance index past script.
      * Index must be immediately after the language and IDSeparator.
      * If the item at this position is not a script (is not four characters
      * long) leave index.  Otherwise index is left at a terminator or
      * id separator.
      */
     private void skipScript() {
         if (!atTerminator()) {
             int oldIndex = index;
             ++index;

             skipUntilTerminatorOrIDSeparator();
             if (index - oldIndex != 5) { // +1 to account for separator
                 index = oldIndex;
             }
         }
     }

     /**
      * Advance index past country, and accumulate normalized country in buffer.
      * Index must be immediately after the script (if there is one, else language)
      * and IDSeparator.  Return the start of the country code in the buffer.
      *
      * <p>Text of two or three characters is kept (upper-cased); a three character
      * code is replaced when LocaleIDs supplies a replacement.  Any other non-zero
      * length is rolled back.  Sets {@code hadCountry} accordingly.
      */
     private int parseCountry() {
         if (!atTerminator()) {
             int oldIndex = index;
             ++index;

             int oldBlen = blen;
             char c;
             while (!isTerminatorOrIDSeparator(c = next())) {
                 if (oldBlen == blen) { // first character: emit the separator first
                     hadCountry = true; // we have a country, let variant parsing know
                     addSeparator();
                     ++oldBlen; // increment past separator
                 }
                 append(Character.toUpperCase(c));
             }
             --index; // unget

             int charsAppended = blen - oldBlen;

             if (charsAppended == 0) {
                 // Do nothing.
             }
             else if (charsAppended < 2 || charsAppended > 3) {
                 // It's not a country, so return index and blen to
                 // their previous values.
                 index = oldIndex;
                 --oldBlen;
                 blen = oldBlen;
                 hadCountry = false;
             }
             else if (charsAppended == 3) {
                 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
                 if (region != null) {
                     set(oldBlen, region);
                 }
             }

             return oldBlen;
         }

         return blen;
     }

     /**
      * Advance index past country.
      * Index must be immediately after the script (if there is one, else language)
      * and IDSeparator.
      */
     private void skipCountry() {
         if (!atTerminator()) {
             ++index;
             /*
              * Save the index point after the separator, since the format
              * requires two separators if the country is not present.
              */
             int oldIndex = index;

             skipUntilTerminatorOrIDSeparator();
             int charsSkipped = index - oldIndex;
             if (charsSkipped < 2 || charsSkipped > 3) {
                 index = oldIndex;
             }
         }
     }

     /**
      * Advance index past variant, and accumulate normalized variant in buffer.  This ignores
      * the codepage information from POSIX ids.  Index must be immediately after the country
      * or script.  Index is left at the keyword separator or at the end of the text.  Return
      * the start of the variant code in the buffer.
      *
      * In standard form, we can have the following forms:
      * ll__VVVV
      * ll_CC_VVVV
      * ll_Ssss_VVVV
      * ll_Ssss_CC_VVVV
      *
      * This also handles POSIX ids, which can have the following forms (pppp is code page id):
      * ll_CC.pppp          --> ll_CC
      * ll_CC.pppp@VVVV     --> ll_CC_VVVV
      * ll_CC@VVVV          --> ll_CC_VVVV
      *
      * We identify this use of '@' in POSIX ids by looking for an '=' following
      * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
      * being part of a POSIX id.
      *
      * Note:  since it was decided that we want an option to not handle POSIX ids, this
      * becomes a bit more complex.
      */
     private int parseVariant() {
         int oldBlen = blen;

         boolean start = true;          // true until the first character has been consumed
         boolean needSeparator = true;  // a separator is owed before the next variant character
         boolean skipping = false;      // true while inside a '.codepage' section
         char c;
         while ((c = next()) != DONE) {
             if (c == DOT) {
                 start = false;
                 skipping = true;
             } else if (c == KEYWORD_SEPARATOR) {
                 if (haveKeywordAssign()) {
                     break; // '@' begins a keyword list, the variant is finished
                 }
                 skipping = false;
                 start = false;
                 needSeparator = true; // add another underscore if we have more text
             } else if (start) {
                 start = false; // the first character (the leading separator) is dropped
             } else if (!skipping) {
                 if (needSeparator) {
                     boolean incOldBlen = blen == oldBlen; // need to skip separators
                     needSeparator = false;
                     if (incOldBlen && !hadCountry) { // no country, we'll need two
                         addSeparator();
                         ++oldBlen; // for sure
                     }
                     addSeparator();
                     if (incOldBlen) { // only for the first separator
                         ++oldBlen;
                     }
                 }
                 c = Character.toUpperCase(c);
                 if (c == HYPHEN || c == COMMA) {
                     c = UNDERSCORE;
                 }
                 append(c);
             }
         }
         --index; // unget

         return oldBlen;
     }

     // no need for skipvariant, to get the keywords we'll just scan directly for
     // the keyword separator

     /**
      * Returns the normalized language id, or the empty string.
      */
     public String getLanguage() {
         reset();
         return getString(parseLanguage());
     }

     /**
      * Returns the normalized script id, or the empty string.
      */
     public String getScript() {
         reset();
         skipLanguage();
         return getString(parseScript());
     }

     /**
      * Returns the normalized country id, or the empty string.
      */
     public String getCountry() {
         reset();
         skipLanguage();
         skipScript();
         return getString(parseCountry());
     }

     /**
      * Returns the normalized variant id, or the empty string.
      */
     public String getVariant() {
         reset();
         skipLanguage();
         skipScript();
         skipCountry();
         return getString(parseVariant());
     }

     /**
      * Returns the language, script, country, and variant as separate strings,
      * in that order.
      */
     public String[] getLanguageScriptCountryVariant() {
         reset();
         // array initializers evaluate left to right, so the parse order is fixed
         return new String[] {
                 getString(parseLanguage()),
                 getString(parseScript()),
                 getString(parseCountry()),
                 getString(parseVariant())
         };
     }

     /**
      * Overrides the base name.  While the override is non-null,
      * {@link #getBaseName()} returns it and {@link #parseBaseName()} copies it
      * into the buffer instead of parsing the id.
      *
      * @param baseName the base name to use, or null to go back to parsing the id
      */
     public void setBaseName(String baseName) {
         this.baseName = baseName;
     }

     /**
      * Fills the buffer with the base name: the override set through
      * {@link #setBaseName(String)} if there is one, otherwise the normalized
      * language, script, country and variant parsed from the id.
      */
     public void parseBaseName() {
         if (baseName != null) {
             set(0, baseName);
         } else {
             reset();
             parseLanguage();
             parseScript();
             parseCountry();
             parseVariant();

             // catch unwanted trailing underscore after country if there was no variant
             if (blen > 1 && buffer[blen-1] == UNDERSCORE) {
                 --blen;
             }
         }
     }

     /**
      * Returns the normalized base form of the locale id.  The base
      * form does not include keywords.
      */
     public String getBaseName() {
         if (baseName != null) {
             return baseName;
         }
         parseBaseName();
         return getString(0);
     }

     /**
      * Returns the normalized full form of the locale id.  The full
      * form includes keywords if they are present.
      */
     public String getName() {
         parseBaseName();
         parseKeywords();
         return getString(0);
     }

     // keyword utilities

     /**
      * If we have keywords, advance index to the start of the keywords and return true,
      * otherwise return false.  Only the first '@' at or after index is considered.
      * When canonicalizing, that '@' counts only if a '=' follows it; otherwise it
      * counts whenever at least one character follows it.
      */
     private boolean setToKeywordStart() {
         for (int i = index; i < id.length; ++i) {
             if (id[i] == KEYWORD_SEPARATOR) {
                 if (canonicalize) {
                     for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
                         if (id[j] == KEYWORD_ASSIGN) {
                             index = i;
                             return true;
                         }
                     }
                 } else {
                     if (++i < id.length) {
                         index = i;
                         return true;
                     }
                 }
                 break;
             }
         }
         return false;
     }

     /**
      * Returns true if the character is DONE or the keyword assignment character ('=').
      */
     private static boolean isDoneOrKeywordAssign(char c) {
         return c == DONE || c == KEYWORD_ASSIGN;
     }

     /**
      * Returns true if the character is DONE or the keyword item separator (';').
      */
     private static boolean isDoneOrItemSeparator(char c) {
         return c == DONE || c == ITEM_SEPARATOR;
     }

     /**
      * Reads a keyword name: the text from index up to the next '=' or the end of the
      * id, trimmed and passed through AsciiUtil.toLowerString.  Index is left at the
      * '=' or at the end.
      */
     private String getKeyword() {
         int start = index;
         while (!isDoneOrKeywordAssign(next())) {
             // just consume
         }
         --index;
         return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
     }

     /**
      * Reads a keyword value: the text from index up to the next ';' or the end of
      * the id, trimmed.  Case is preserved.  Index is left at the ';' or at the end.
      */
     private String getValue() {
         int start = index;
         while (!isDoneOrItemSeparator(next())) {
             // just consume
         }
         --index;
         return new String(id, start, index-start).trim(); // leave case alone
     }

     /**
      * Returns the comparator used to order keyword maps.
      */
     private Comparator<String> getKeyComparator() {
         return KEY_COMPARATOR;
     }

     /**
      * Returns a map of the keywords and values; the map is empty (never null) if
      * there are none.  The result is computed on the first call, scanning from the
      * current index, and cached; the cached map itself is returned.
      *
      * <p>Entries with an empty value and repeated keys (after the first) are
      * silently ignored.
      */
     public Map<String, String> getKeywordMap() {
         if (keywords == null) {
             TreeMap<String, String> m = null;
             if (setToKeywordStart()) {
                 // keys are trimmed and lower-cased; values are trimmed with case kept.
                 do {
                     String key = getKeyword();
                     if (key.length() == 0) {
                         break;
                     }
                     char c = next();
                     if (c != KEYWORD_ASSIGN) {
                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
                         if (c == DONE) {
                             break;
                         } else {
                             continue;
                         }
                     }
                     String value = getValue();
                     if (value.length() == 0) {
                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
                         continue;
                     }
                     if (m == null) {
                         m = new TreeMap<String, String>(getKeyComparator());
                     } else if (m.containsKey(key)) {
                         // throw new IllegalArgumentException("key '" + key + "' already has a value.");
                         continue;
                     }
                     m.put(key, value);
                 } while (next() == ITEM_SEPARATOR);
             }
             keywords = m != null ? m : Collections.<String, String>emptyMap();
         }

         return keywords;
     }


     /**
      * Append the keywords to the buffer as {@code @key=value;key=value...} in map
      * order, and return the start of the keyword text in the buffer (just past
      * the '@' when anything was appended).
      */
     private int parseKeywords() {
         int oldBlen = blen;
         Map<String, String> m = getKeywordMap();
         if (!m.isEmpty()) {
             boolean first = true;
             for (Map.Entry<String, String> e : m.entrySet()) {
                 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
                 first = false;
                 append(e.getKey());
                 append(KEYWORD_ASSIGN);
                 append(e.getValue());
             }
             if (blen != oldBlen) {
                 ++oldBlen;
             }
         }
         return oldBlen;
     }

     /**
      * Returns an iterator over the keywords, or null if we have an empty map.
      */
     public Iterator<String> getKeywords() {
         Map<String, String> m = getKeywordMap();
         return m.isEmpty() ? null : m.keySet().iterator();
     }

     /**
      * Returns the value for the named keyword, or null if the keyword is not
      * present.  The name is trimmed and lower-cased before the lookup.
      *
      * @param keywordName the keyword to look up; must not be null
      */
     public String getKeywordValue(String keywordName) {
         Map<String, String> m = getKeywordMap();
         return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
     }

     /**
      * Set the keyword value only if it is not already set to something else.
      *
      * @throws IllegalArgumentException if the keyword name, or a non-null value,
      *         is empty after trimming
      */
     public void defaultKeywordValue(String keywordName, String value) {
         setKeywordValue(keywordName, value, false);
     }

     /**
      * Set the value for the named keyword, or unset it if value is null.  If
      * keywordName itself is null, unset all keywords (value is ignored).
      *
      * @throws IllegalArgumentException if the keyword name, or a non-null value,
      *         is empty after trimming
      */
     public void setKeywordValue(String keywordName, String value) {
         setKeywordValue(keywordName, value, true);
     }

     /**
      * Set the value for the named keyword, or unset it if value is null.  If
      * keywordName itself is null, unset all keywords.  If reset is true, ignore
      * any previous value for the keyword, otherwise do not change a keyword that
      * is already present (and do not remove all keywords).
      *
      * @throws IllegalArgumentException if the keyword name, or a non-null value,
      *         is empty after trimming
      */
     private void setKeywordValue(String keywordName, String value, boolean reset) {
         if (keywordName == null) {
             if (reset) {
                 // force new map, ignore value
                 keywords = Collections.<String, String>emptyMap();
             }
         } else {
             keywordName = AsciiUtil.toLowerString(keywordName.trim());
             if (keywordName.length() == 0) {
                 throw new IllegalArgumentException("keyword must not be empty");
             }
             if (value != null) {
                 value = value.trim();
                 if (value.length() == 0) {
                     throw new IllegalArgumentException("value must not be empty");
                 }
             }
             Map<String, String> m = getKeywordMap();
             if (m.isEmpty()) { // it is EMPTY_MAP
                 if (value != null) {
                     // force new map; value was already trimmed above
                     keywords = new TreeMap<String, String>(getKeyComparator());
                     keywords.put(keywordName, value);
                 }
             } else {
                 if (reset || !m.containsKey(keywordName)) {
                     if (value != null) {
                         m.put(keywordName, value);
                     } else {
                         m.remove(keywordName);
                         if (m.isEmpty()) {
                             // force new map
                             keywords = Collections.<String, String>emptyMap();
                         }
                     }
                 }
             }
         }
     }
 }
	/*
	******************************************************************************
	* Copyright (C) 2003-2009, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	******************************************************************************
	*/

	package com.ibm.icu.impl;

	import java.util.Collections;
	import java.util.Comparator;
	import java.util.Iterator;
	import java.util.Map;
	import java.util.TreeMap;

	import com.ibm.icu.impl.locale.AsciiUtil;

	/**
	* Utility class to parse and normalize locale ids (including POSIX style)
	*/
	public final class LocaleIDParser {
	private char[] id;
	private int index;
	private char[] buffer;
	private int blen;
	// um, don't handle POSIX ids unless we request it. why not? well... because.
	private boolean canonicalize;
	private boolean hadCountry;

	// used when canonicalizing
	Map<String, String> keywords;
	String baseName;

	/**
	* Parsing constants.
	*/
	private static final char KEYWORD_SEPARATOR = '@';
	private static final char HYPHEN = '-';
	private static final char KEYWORD_ASSIGN = '=';
	private static final char COMMA = ',';
	private static final char ITEM_SEPARATOR = ';';
	private static final char DOT = '.';
	private static final char UNDERSCORE = '_';

	public LocaleIDParser(String localeID) {
	this(localeID, false);
	}

	public LocaleIDParser(String localeID, boolean canonicalize) {
	id = localeID.toCharArray();
	index = 0;
	buffer = new char[id.length + 5];
	blen = 0;
	this.canonicalize = canonicalize;
	}

	private void reset() {
	index = blen = 0;
	}

	// utilities for working on text in the buffer

	/**
	* Append c to the buffer.
	*/
	private void append(char c) {
	try {
	buffer[blen] = c;
	}
	catch (IndexOutOfBoundsException e) {
	if (buffer.length > 512) {
	// something is seriously wrong, let this go
	throw e;
	}
	char[] nbuffer = new char[buffer.length * 2];
	System.arraycopy(buffer, 0, nbuffer, 0, buffer.length);
	nbuffer[blen] = c;
	buffer = nbuffer;
	}
	++blen;
	}

	private void addSeparator() {
	append(UNDERSCORE);
	}

	/**
	* Returns the text in the buffer from start to blen as a String.
	*/
	private String getString(int start) {
	if (start == blen) {
	return "";
	}
	return new String(buffer, start, blen-start);
	}

	/**
	* Set the length of the buffer to pos, then append the string.
	*/
	private void set(int pos, String s) {
	this.blen = pos; // no safety
	append(s);
	}

	/**
	* Append the string to the buffer.
	*/
	private void append(String s) {
	for (int i = 0; i < s.length(); ++i) {
	append(s.charAt(i));
	}
	}

	// utilities for parsing text out of the id

	/**
	* Character to indicate no more text is available in the id.
	*/
	private static final char DONE = '\uffff';

	/**
	* Returns the character at index in the id, and advance index. The returned character
	* is DONE if index was at the limit of the buffer. The index is advanced regardless
	* so that decrementing the index will always 'unget' the last character returned.
	*/
	private char next() {
	if (index == id.length) {
	index++;
	return DONE;
	}

	return id[index++];
	}

	/**
	* Advance index until the next terminator or id separator, and leave it there.
	*/
	private void skipUntilTerminatorOrIDSeparator() {
	while (!isTerminatorOrIDSeparator(next())) {
	}
	--index;
	}

	/**
	* Returns true if the character at index in the id is a terminator.
	*/
	private boolean atTerminator() {
	return index >= id.length \|\| isTerminator(id[index]);
	}

	/*
	* Returns true if the character is an id separator (underscore or hyphen).
	*/
	/* private boolean isIDSeparator(char c) {
	return c == UNDERSCORE \|\| c == HYPHEN;
	}*/

	/**
	* Returns true if the character is a terminator (keyword separator, dot, or DONE).
	* Dot is a terminator because of the POSIX form, where dot precedes the codepage.
	*/
	private boolean isTerminator(char c) {
	// always terminate at DOT, even if not handling POSIX. It's an error...
	return c == KEYWORD_SEPARATOR \|\| c == DONE \|\| c == DOT;
	}

	/**
	* Returns true if the character is a terminator or id separator.
	*/
	private boolean isTerminatorOrIDSeparator(char c) {
	return c == KEYWORD_SEPARATOR \|\| c == UNDERSCORE \|\| c == HYPHEN \|\|
	c == DONE \|\| c == DOT;
	}

	/**
	* Returns true if the start of the buffer has an experimental or private language
	* prefix, the pattern '[ixIX][-_].' shows the syntax checked.
	*/
	private boolean haveExperimentalLanguagePrefix() {
	if (id.length > 2) {
	char c = id[1];
	if (c == HYPHEN \|\| c == UNDERSCORE) {
	c = id[0];
	return c == 'x' \|\| c == 'X' \|\| c == 'i' \|\| c == 'I';
	}
	}
	return false;
	}

	/**
	* Returns true if a value separator occurs at or after index.
	*/
	private boolean haveKeywordAssign() {
	// assume it is safe to start from index
	for (int i = index; i < id.length; ++i) {
	if (id[i] == KEYWORD_ASSIGN) {
	return true;
	}
	}
	return false;
	}

	/**
	* Advance index past language, and accumulate normalized language code in buffer.
	* Index must be at 0 when this is called. Index is left at a terminator or id
	* separator. Returns the start of the language code in the buffer.
	*/
	private int parseLanguage() {
	if (haveExperimentalLanguagePrefix()) {
	append(Character.toLowerCase(id[0]));
	append(HYPHEN);
	index = 2;
	}

	char c;
	while(!isTerminatorOrIDSeparator(c = next())) {
	append(Character.toLowerCase(c));
	}
	--index; // unget

	if (blen == 3) {
	String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
	if (lang != null) {
	set(0, lang);
	}
	}

	return 0;
	}

	/**
	* Advance index past language. Index must be at 0 when this is called. Index
	* is left at a terminator or id separator.
	*/
	private void skipLanguage() {
	if (haveExperimentalLanguagePrefix()) {
	index = 2;
	}
	skipUntilTerminatorOrIDSeparator();
	}

	/**
	* Advance index past script, and accumulate normalized script in buffer.
	* Index must be immediately after the language.
	* If the item at this position is not a script (is not four characters
	* long) leave index and buffer unchanged. Otherwise index is left at
	* a terminator or id separator. Returns the start of the script code
	* in the buffer (this may be equal to the buffer length, if there is no
	* script).
	*/
	private int parseScript() {
	if (!atTerminator()) {
	int oldIndex = index; // save original index
	++index;

	int oldBlen = blen; // get before append hyphen, if we truncate everything is undone
	char c;
	while(!isTerminatorOrIDSeparator(c = next())) {
	if (blen == oldBlen) { // first pass
	addSeparator();
	append(Character.toUpperCase(c));
	} else {
	append(Character.toLowerCase(c));
	}
	}
	--index; // unget

	/* If it's not exactly 4 characters long, then it's not a script. */
	if (index - oldIndex != 5) { // +1 to account for separator
	index = oldIndex;
	blen = oldBlen;
	} else {
	oldBlen++; // index past hyphen, for clients who want to extract just the script
	}

	return oldBlen;
	}
	return blen;
	}

	/**
	* Advance index past script.
	* Index must be immediately after the language and IDSeparator.
	* If the item at this position is not a script (is not four characters
	* long) leave index. Otherwise index is left at a terminator or
	* id separator.
	*/
	private void skipScript() {
	if (!atTerminator()) {
	int oldIndex = index;
	++index;

	skipUntilTerminatorOrIDSeparator();
	if (index - oldIndex != 5) { // +1 to account for separator
	index = oldIndex;
	}
	}
	}

	/**
	* Advance index past country, and accumulate normalized country in buffer.
	* Index must be immediately after the script (if there is one, else language)
	* and IDSeparator. Return the start of the country code in the buffer.
	*/
	private int parseCountry() {
	if (!atTerminator()) {
	int oldIndex = index;
	++index;

	int oldBlen = blen;
	char c;
	while (!isTerminatorOrIDSeparator(c = next())) {
	if (oldBlen == blen) { // first, add hyphen
	hadCountry = true; // we have a country, let variant parsing know
	addSeparator();
	++oldBlen; // increment past hyphen
	}
	append(Character.toUpperCase(c));
	}
	--index; // unget

	int charsAppended = blen - oldBlen;

	if (charsAppended == 0) {
	// Do nothing.
	}
	else if (charsAppended < 2 \|\| charsAppended > 3) {
	// It's not a country, so return index and blen to
	// their previous values.
	index = oldIndex;
	--oldBlen;
	blen = oldBlen;
	hadCountry = false;
	}
	else if (charsAppended == 3) {
	String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
	if (region != null) {
	set(oldBlen, region);
	}
	}

	return oldBlen;
	}

	return blen;
	}

	/**
	* Advance index past country.
	* Index must be immediately after the script (if there is one, else language)
	* and IDSeparator.
	*/
	private void skipCountry() {
	if (!atTerminator()) {
	++index;
	/*
	* Save the index point after the separator, since the format
	* requires two separators if the country is not present.
	*/
	int oldIndex = index;

	skipUntilTerminatorOrIDSeparator();
	int charsSkipped = index - oldIndex;
	if (charsSkipped < 2 \|\| charsSkipped > 3) {
	index = oldIndex;
	}
	}
	}

	/**
	* Advance index past variant, and accumulate normalized variant in buffer. This ignores
	* the codepage information from POSIX ids. Index must be immediately after the country
	* or script. Index is left at the keyword separator or at the end of the text. Return
	* the start of the variant code in the buffer.
	*
	* In standard form, we can have the following forms:
	* ll__VVVV
	* ll_CC_VVVV
	* ll_Ssss_VVVV
	* ll_Ssss_CC_VVVV
	*
	* This also handles POSIX ids, which can have the following forms (pppp is code page id):
	* ll_CC.pppp --> ll_CC
	* ll_CC.pppp@VVVV --> ll_CC_VVVV
	* ll_CC@VVVV --> ll_CC_VVVV
	*
	* We identify this use of '@' in POSIX ids by looking for an '=' following
	* the '@'. If there is one, we consider '@' to start a keyword list, instead of
	* being part of a POSIX id.
	*
	* Note: since it was decided that we want an option to not handle POSIX ids, this
	* becomes a bit more complex.
	*/
	private int parseVariant() {
	int oldBlen = blen;

	boolean start = true;
	boolean needSeparator = true;
	boolean skipping = false;
	char c;
	while ((c = next()) != DONE) {
	if (c == DOT) {
	start = false;
	skipping = true;
	} else if (c == KEYWORD_SEPARATOR) {
	if (haveKeywordAssign()) {
	break;
	}
	skipping = false;
	start = false;
	needSeparator = true; // add another underscore if we have more text
	} else if (start) {
	start = false;
	} else if (!skipping) {
	if (needSeparator) {
	boolean incOldBlen = blen == oldBlen; // need to skip separators
	needSeparator = false;
	if (incOldBlen && !hadCountry) { // no country, we'll need two
	addSeparator();
	++oldBlen; // for sure
	}
	addSeparator();
	if (incOldBlen) { // only for the first separator
	++oldBlen;
	}
	}
	c = Character.toUpperCase(c);
	if (c == HYPHEN \|\| c == COMMA) {
	c = UNDERSCORE;
	}
	append(c);
	}
	}
	--index; // unget

	return oldBlen;
	}

	// no need for skipvariant, to get the keywords we'll just scan directly for
	// the keyword separator

	/**
	 * Parses the id from the beginning and extracts the language.
	 *
	 * @return the normalized language id, or the empty string
	 */
	public String getLanguage() {
		reset();
		final int languageStart = parseLanguage();
		return getString(languageStart);
	}

	/**
	 * Parses the id from the beginning, skips the language, and extracts the
	 * script.
	 *
	 * @return the normalized script id, or the empty string
	 */
	public String getScript() {
		reset();
		skipLanguage();
		final int scriptStart = parseScript();
		return getString(scriptStart);
	}

	/**
	 * Parses the id from the beginning, skips the language and script, and
	 * extracts the country.
	 *
	 * @return the normalized country id, or the empty string
	 */
	public String getCountry() {
		reset();
		skipLanguage();
		skipScript();
		final int countryStart = parseCountry();
		return getString(countryStart);
	}

	/**
	 * Parses the id from the beginning, skips the language, script and country,
	 * and extracts the variant.
	 *
	 * @return the normalized variant id, or the empty string
	 */
	public String getVariant() {
		reset();
		skipLanguage();
		skipScript();
		skipCountry();
		final int variantStart = parseVariant();
		return getString(variantStart);
	}

	/**
	 * Parses the id once from the beginning and returns its four base fields.
	 *
	 * Each field is extracted from the buffer right after it has been parsed and
	 * before the next field is parsed, in the order language, script, country,
	 * variant.
	 *
	 * @return a four-element array: language, script, country, variant
	 */
	public String[] getLanguageScriptCountryVariant() {
		reset();

		final String language = getString(parseLanguage());
		final String script = getString(parseScript());
		final String country = getString(parseCountry());
		final String variant = getString(parseVariant());

		return new String[] { language, script, country, variant };
	}

	/**
	* Stores an explicit base name. While this is non-null, parseBaseName()
	* copies it into the buffer and getBaseName() returns it as-is, instead of
	* parsing the base name out of the locale id.
	*/
	public void setBaseName(String baseName) {
	this.baseName = baseName;
	}

	/**
	 * Fills the buffer with the base name.
	 *
	 * When an explicit base name has been stored it is copied into the buffer at
	 * position 0. Otherwise the id is parsed from the beginning (language,
	 * script, country, variant, in that order) and a single trailing underscore
	 * left in the buffer is dropped.
	 */
	public void parseBaseName() {
		if (baseName != null) {
			set(0, baseName);
			return;
		}

		reset();
		parseLanguage();
		parseScript();
		parseCountry();
		parseVariant();

		// catch unwanted trailing underscore after country if there was no variant
		final int last = blen - 1;
		if (last > 0 && buffer[last] == UNDERSCORE) {
			blen = last;
		}
	}

	/**
	 * Returns the normalized base form of the locale id; the base form carries
	 * no keywords. An explicitly stored base name is returned unchanged,
	 * otherwise the base name is parsed out of the id.
	 */
	public String getBaseName() {
		if (baseName == null) {
			parseBaseName();
			return getString(0);
		}
		return baseName;
	}

	/**
	 * Returns the normalized full form of the locale id: the base name followed
	 * by the keywords, when there are any.
	 */
	public String getName() {
		parseBaseName();  // buffer now holds the base name
		parseKeywords();  // appends the keyword list, if any, after it
		final String fullName = getString(0);
		return fullName;
	}

	// keyword utilities

	/**
	 * Positions the index on the first character after the keyword separator.
	 *
	 * Only the first keyword separator found at or after the current index is
	 * considered. When canonicalizing, it counts as the start of a keyword list
	 * only if a keyword-assign character occurs somewhere after it; otherwise it
	 * is enough that at least one character follows it.
	 *
	 * @return true if the index was moved to the start of the keywords, false if
	 *         there are none (the index is then unchanged)
	 */
	private boolean setToKeywordStart() {
		// Locate the first keyword separator from the current position.
		int sepPos = index;
		while (sepPos < id.length && id[sepPos] != KEYWORD_SEPARATOR) {
			++sepPos;
		}
		if (sepPos >= id.length) {
			return false;
		}

		final int firstKeywordChar = sepPos + 1;
		if (canonicalize) {
			for (int probe = firstKeywordChar; probe < id.length; ++probe) {
				if (id[probe] == KEYWORD_ASSIGN) {
					index = firstKeywordChar;
					return true;
				}
			}
			return false;
		}

		if (firstKeywordChar < id.length) {
			index = firstKeywordChar;
			return true;
		}
		return false;
	}

	private static boolean isDoneOrKeywordAssign(char c) {
	return c == DONE \|\| c == KEYWORD_ASSIGN;
	}

	private static boolean isDoneOrItemSeparator(char c) {
	return c == DONE \|\| c == ITEM_SEPARATOR;
	}

	/**
	 * Reads a keyword name starting at the current index and running up to (but
	 * not including) the next keyword-assign character or the end of the id. The
	 * index is left on that ending character.
	 *
	 * @return the keyword name, trimmed and lower-cased
	 */
	private String getKeyword() {
		final int begin = index;
		char c;
		do {
			c = next();
		} while (!isDoneOrKeywordAssign(c));
		--index; // unget the character that ended the name

		final String raw = new String(id, begin, index - begin);
		return AsciiUtil.toLowerString(raw.trim());
	}

	/**
	 * Reads a keyword value starting at the current index and running up to (but
	 * not including) the next item separator or the end of the id. The index is
	 * left on that ending character.
	 *
	 * @return the value, trimmed; its case is left alone
	 */
	private String getValue() {
		final int begin = index;
		char c;
		do {
			c = next();
		} while (!isDoneOrItemSeparator(c));
		--index; // unget the character that ended the value

		final String raw = new String(id, begin, index - begin);
		return raw.trim();
	}

	/**
	 * Creates the comparator used to order keyword names in the keyword map.
	 * It orders strings by String.compareTo.
	 */
	private Comparator<String> getKeyComparator() {
		return new Comparator<String>() {
			public int compare(String left, String right) {
				return left.compareTo(right);
			}
		};
	}

	/**
	 * Returns the map of keywords to values, parsing it from the id on first use
	 * and caching it in the keywords field afterwards.
	 *
	 * Keyword names are trimmed and lower-cased; values are trimmed but keep
	 * their case. An entry with an empty value is ignored, and when a keyword
	 * occurs more than once the first occurrence wins. Parsing stops at an empty
	 * keyword name or at a keyword that is not followed by a keyword-assign
	 * character.
	 *
	 * @return the keyword map; an empty map (never null) when there are no
	 *         keywords
	 */
	public Map<String, String> getKeywordMap() {
		if (keywords != null) {
			return keywords; // already parsed (or explicitly set)
		}

		TreeMap<String, String> parsed = null;
		if (setToKeywordStart()) {
			do {
				final String key = getKeyword();
				if (key.length() == 0) {
					break;
				}

				final char afterKey = next();
				if (afterKey != KEYWORD_ASSIGN) {
					// throw new IllegalArgumentException("key '" + key + "' missing a value.");
					if (afterKey == DONE) {
						break;
					}
					continue;
				}

				final String value = getValue();
				if (value.length() == 0) {
					// throw new IllegalArgumentException("key '" + key + "' missing a value.");
					continue;
				}

				if (parsed == null) {
					parsed = new TreeMap<String, String>(getKeyComparator());
				}
				// throw new IllegalArgumentException("key '" + key + "' already has a value.");
				if (!parsed.containsKey(key)) {
					parsed.put(key, value);
				}
			} while (next() == ITEM_SEPARATOR);
		}

		keywords = (parsed == null) ? Collections.<String, String>emptyMap() : parsed;
		return keywords;
	}


	/**
	 * Appends the keyword list to the buffer: the keyword separator, then
	 * key=value pairs in the keyword map's iteration order, joined by the item
	 * separator.
	 *
	 * @return the buffer offset of the first keyword name (just past the keyword
	 *         separator), or the unchanged buffer length when there are no
	 *         keywords
	 */
	private int parseKeywords() {
		int keywordsStart = blen;
		final Map<String, String> map = getKeywordMap();
		if (map.isEmpty()) {
			return keywordsStart;
		}

		char lead = KEYWORD_SEPARATOR; // '@' before the first pair, ';' before the rest
		for (Map.Entry<String, String> entry : map.entrySet()) {
			append(lead);
			lead = ITEM_SEPARATOR;
			append(entry.getKey());
			append(KEYWORD_ASSIGN);
			append(entry.getValue());
		}

		if (blen != keywordsStart) {
			++keywordsStart; // step past the keyword separator
		}
		return keywordsStart;
	}

	/**
	 * Returns an iterator over the keyword names.
	 *
	 * @return an iterator over the keys of the keyword map, or null when the map
	 *         is empty
	 */
	public Iterator<String> getKeywords() {
		final Map<String, String> map = getKeywordMap();
		if (map.isEmpty()) {
			return null;
		}
		return map.keySet().iterator();
	}

	/**
	 * Looks up the value of a keyword. The name is trimmed and lower-cased
	 * before the lookup.
	 *
	 * @param keywordName the keyword to look up
	 * @return the keyword's value, or null when the keyword is not present
	 */
	public String getKeywordValue(String keywordName) {
		final Map<String, String> map = getKeywordMap();
		if (map.isEmpty()) {
			return null;
		}
		final String normalizedName = AsciiUtil.toLowerString(keywordName.trim());
		return map.get(normalizedName);
	}

	/**
	* Set the keyword value only if it is not already set to something else.
	* Delegates to the three-argument setKeywordValue with reset == false, so an
	* entry that already exists for keywordName is left unchanged, and a null
	* keywordName is a no-op.
	*
	* @param keywordName the keyword; it is trimmed and lower-cased before use
	* @param value the value to use when the keyword has no entry yet; it is
	*        trimmed before use
	* @throws IllegalArgumentException if keywordName, or a non-null value, is
	*         empty after trimming
	*/
	public void defaultKeywordValue(String keywordName, String value) {
	setKeywordValue(keywordName, value, false);
	}

	/**
	* Set the value for the named keyword, or unset it if value is null. If
	* keywordName itself is null, unset all keywords (value is then ignored).
	* Delegates to the three-argument setKeywordValue with reset == true, so any
	* existing value for the keyword is overwritten.
	*
	* @param keywordName the keyword, or null to unset all keywords; it is
	*        trimmed and lower-cased before use
	* @param value the new value, or null to remove the keyword; it is trimmed
	*        before use
	* @throws IllegalArgumentException if a non-null keywordName, or a non-null
	*         value, is empty after trimming
	*/
	public void setKeywordValue(String keywordName, String value) {
	setKeywordValue(keywordName, value, true);
	}

	/**
	* Set the value for the named keyword, or unset it if value is null. If
	* keywordName itself is null, unset all keywords. If keywordName is not null,
	* value must not be null. If reset is true, ignore any previous value for
	* the keyword, otherwise do not change the keyword (including removal of
	* one or all keywords).
	*/
	private void setKeywordValue(String keywordName, String value, boolean reset) {
	if (keywordName == null) {
	if (reset) {
	// force new map, ignore value
	keywords = Collections.<String, String>emptyMap();
	}
	} else {
	keywordName = AsciiUtil.toLowerString(keywordName.trim());
	if (keywordName.length() == 0) {
	throw new IllegalArgumentException("keyword must not be empty");
	}
	if (value != null) {
	value = value.trim();
	if (value.length() == 0) {
	throw new IllegalArgumentException("value must not be empty");
	}
	}
	Map<String, String> m = getKeywordMap();
	if (m.isEmpty()) { // it is EMPTY_MAP
	if (value != null) {
	// force new map
	keywords = new TreeMap<String, String>(getKeyComparator());
	keywords.put(keywordName, value.trim());
	}
	} else {
	if (reset \|\| !m.containsKey(keywordName)) {
	if (value != null) {
	m.put(keywordName, value);
	} else {
	m.remove(keywordName);
	if (m.isEmpty()) {
	// force new map
	keywords = Collections.<String, String>emptyMap();
	}
	}
	}
	}
	}
	}
	}