/* | |
****************************************************************************** | |
* Copyright (C) 2003-2011, International Business Machines Corporation and * | |
* others. All Rights Reserved. * | |
****************************************************************************** | |
*/ | |
package com.ibm.icu.impl; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.Iterator; | |
import java.util.Map; | |
import java.util.TreeMap; | |
import com.ibm.icu.impl.locale.AsciiUtil; | |
/** | |
* Utility class to parse and normalize locale ids (including POSIX style) | |
*/ | |
public final class LocaleIDParser { | |
private char[] id; | |
private int index; | |
private char[] buffer; | |
private int blen; | |
// um, don't handle POSIX ids unless we request it. why not? well... because. | |
private boolean canonicalize; | |
private boolean hadCountry; | |
// used when canonicalizing | |
Map<String, String> keywords; | |
String baseName; | |
/** | |
* Parsing constants. | |
*/ | |
private static final char KEYWORD_SEPARATOR = '@'; | |
private static final char HYPHEN = '-'; | |
private static final char KEYWORD_ASSIGN = '='; | |
private static final char COMMA = ','; | |
private static final char ITEM_SEPARATOR = ';'; | |
private static final char DOT = '.'; | |
private static final char UNDERSCORE = '_'; | |
public LocaleIDParser(String localeID) { | |
this(localeID, false); | |
} | |
public LocaleIDParser(String localeID, boolean canonicalize) { | |
id = localeID.toCharArray(); | |
index = 0; | |
buffer = new char[id.length + 5]; | |
blen = 0; | |
this.canonicalize = canonicalize; | |
} | |
private void reset() { | |
index = blen = 0; | |
} | |
// utilities for working on text in the buffer | |
/** | |
* Append c to the buffer. | |
*/ | |
private void append(char c) { | |
try { | |
buffer[blen] = c; | |
} | |
catch (IndexOutOfBoundsException e) { | |
if (buffer.length > 512) { | |
// something is seriously wrong, let this go | |
throw e; | |
} | |
char[] nbuffer = new char[buffer.length * 2]; | |
System.arraycopy(buffer, 0, nbuffer, 0, buffer.length); | |
nbuffer[blen] = c; | |
buffer = nbuffer; | |
} | |
++blen; | |
} | |
private void addSeparator() { | |
append(UNDERSCORE); | |
} | |
/** | |
* Returns the text in the buffer from start to blen as a String. | |
*/ | |
private String getString(int start) { | |
if (start == blen) { | |
return ""; | |
} | |
return new String(buffer, start, blen-start); | |
} | |
/** | |
* Set the length of the buffer to pos, then append the string. | |
*/ | |
private void set(int pos, String s) { | |
this.blen = pos; // no safety | |
append(s); | |
} | |
/** | |
* Append the string to the buffer. | |
*/ | |
private void append(String s) { | |
for (int i = 0; i < s.length(); ++i) { | |
append(s.charAt(i)); | |
} | |
} | |
// utilities for parsing text out of the id | |
/** | |
* Character to indicate no more text is available in the id. | |
*/ | |
private static final char DONE = '\uffff'; | |
/** | |
* Returns the character at index in the id, and advance index. The returned character | |
* is DONE if index was at the limit of the buffer. The index is advanced regardless | |
* so that decrementing the index will always 'unget' the last character returned. | |
*/ | |
private char next() { | |
if (index == id.length) { | |
index++; | |
return DONE; | |
} | |
return id[index++]; | |
} | |
/** | |
* Advance index until the next terminator or id separator, and leave it there. | |
*/ | |
private void skipUntilTerminatorOrIDSeparator() { | |
while (!isTerminatorOrIDSeparator(next())) { | |
} | |
--index; | |
} | |
/** | |
* Returns true if the character at index in the id is a terminator. | |
*/ | |
private boolean atTerminator() { | |
return index >= id.length || isTerminator(id[index]); | |
} | |
/* | |
* Returns true if the character is an id separator (underscore or hyphen). | |
*/ | |
/* private boolean isIDSeparator(char c) { | |
return c == UNDERSCORE || c == HYPHEN; | |
}*/ | |
/** | |
* Returns true if the character is a terminator (keyword separator, dot, or DONE). | |
* Dot is a terminator because of the POSIX form, where dot precedes the codepage. | |
*/ | |
private boolean isTerminator(char c) { | |
// always terminate at DOT, even if not handling POSIX. It's an error... | |
return c == KEYWORD_SEPARATOR || c == DONE || c == DOT; | |
} | |
/** | |
* Returns true if the character is a terminator or id separator. | |
*/ | |
private boolean isTerminatorOrIDSeparator(char c) { | |
return c == KEYWORD_SEPARATOR || c == UNDERSCORE || c == HYPHEN || | |
c == DONE || c == DOT; | |
} | |
/** | |
* Returns true if the start of the buffer has an experimental or private language | |
* prefix, the pattern '[ixIX][-_].' shows the syntax checked. | |
*/ | |
private boolean haveExperimentalLanguagePrefix() { | |
if (id.length > 2) { | |
char c = id[1]; | |
if (c == HYPHEN || c == UNDERSCORE) { | |
c = id[0]; | |
return c == 'x' || c == 'X' || c == 'i' || c == 'I'; | |
} | |
} | |
return false; | |
} | |
/** | |
* Returns true if a value separator occurs at or after index. | |
*/ | |
private boolean haveKeywordAssign() { | |
// assume it is safe to start from index | |
for (int i = index; i < id.length; ++i) { | |
if (id[i] == KEYWORD_ASSIGN) { | |
return true; | |
} | |
} | |
return false; | |
} | |
/** | |
* Advance index past language, and accumulate normalized language code in buffer. | |
* Index must be at 0 when this is called. Index is left at a terminator or id | |
* separator. Returns the start of the language code in the buffer. | |
*/ | |
private int parseLanguage() { | |
if (haveExperimentalLanguagePrefix()) { | |
append(Character.toLowerCase(id[0])); | |
append(HYPHEN); | |
index = 2; | |
} | |
char c; | |
while(!isTerminatorOrIDSeparator(c = next())) { | |
append(Character.toLowerCase(c)); | |
} | |
--index; // unget | |
if (blen == 3) { | |
String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0)); | |
if (lang != null) { | |
set(0, lang); | |
} | |
} | |
return 0; | |
} | |
/** | |
* Advance index past language. Index must be at 0 when this is called. Index | |
* is left at a terminator or id separator. | |
*/ | |
private void skipLanguage() { | |
if (haveExperimentalLanguagePrefix()) { | |
index = 2; | |
} | |
skipUntilTerminatorOrIDSeparator(); | |
} | |
/** | |
* Advance index past script, and accumulate normalized script in buffer. | |
* Index must be immediately after the language. | |
* If the item at this position is not a script (is not four characters | |
* long) leave index and buffer unchanged. Otherwise index is left at | |
* a terminator or id separator. Returns the start of the script code | |
* in the buffer (this may be equal to the buffer length, if there is no | |
* script). | |
*/ | |
private int parseScript() { | |
if (!atTerminator()) { | |
int oldIndex = index; // save original index | |
++index; | |
int oldBlen = blen; // get before append hyphen, if we truncate everything is undone | |
char c; | |
while(!isTerminatorOrIDSeparator(c = next())) { | |
if (blen == oldBlen) { // first pass | |
addSeparator(); | |
append(Character.toUpperCase(c)); | |
} else { | |
append(Character.toLowerCase(c)); | |
} | |
} | |
--index; // unget | |
/* If it's not exactly 4 characters long, then it's not a script. */ | |
if (index - oldIndex != 5) { // +1 to account for separator | |
index = oldIndex; | |
blen = oldBlen; | |
} else { | |
oldBlen++; // index past hyphen, for clients who want to extract just the script | |
} | |
return oldBlen; | |
} | |
return blen; | |
} | |
/** | |
* Advance index past script. | |
* Index must be immediately after the language and IDSeparator. | |
* If the item at this position is not a script (is not four characters | |
* long) leave index. Otherwise index is left at a terminator or | |
* id separator. | |
*/ | |
private void skipScript() { | |
if (!atTerminator()) { | |
int oldIndex = index; | |
++index; | |
skipUntilTerminatorOrIDSeparator(); | |
if (index - oldIndex != 5) { // +1 to account for separator | |
index = oldIndex; | |
} | |
} | |
} | |
/** | |
* Advance index past country, and accumulate normalized country in buffer. | |
* Index must be immediately after the script (if there is one, else language) | |
* and IDSeparator. Return the start of the country code in the buffer. | |
*/ | |
private int parseCountry() { | |
if (!atTerminator()) { | |
int oldIndex = index; | |
++index; | |
int oldBlen = blen; | |
char c; | |
while (!isTerminatorOrIDSeparator(c = next())) { | |
if (oldBlen == blen) { // first, add hyphen | |
hadCountry = true; // we have a country, let variant parsing know | |
addSeparator(); | |
++oldBlen; // increment past hyphen | |
} | |
append(Character.toUpperCase(c)); | |
} | |
--index; // unget | |
int charsAppended = blen - oldBlen; | |
if (charsAppended == 0) { | |
// Do nothing. | |
} | |
else if (charsAppended < 2 || charsAppended > 3) { | |
// It's not a country, so return index and blen to | |
// their previous values. | |
index = oldIndex; | |
--oldBlen; | |
blen = oldBlen; | |
hadCountry = false; | |
} | |
else if (charsAppended == 3) { | |
String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen)); | |
if (region != null) { | |
set(oldBlen, region); | |
} | |
} | |
return oldBlen; | |
} | |
return blen; | |
} | |
/** | |
* Advance index past country. | |
* Index must be immediately after the script (if there is one, else language) | |
* and IDSeparator. | |
*/ | |
private void skipCountry() { | |
if (!atTerminator()) { | |
++index; | |
/* | |
* Save the index point after the separator, since the format | |
* requires two separators if the country is not present. | |
*/ | |
int oldIndex = index; | |
skipUntilTerminatorOrIDSeparator(); | |
int charsSkipped = index - oldIndex; | |
if (charsSkipped < 2 || charsSkipped > 3) { | |
index = oldIndex; | |
} | |
} | |
} | |
/** | |
* Advance index past variant, and accumulate normalized variant in buffer. This ignores | |
* the codepage information from POSIX ids. Index must be immediately after the country | |
* or script. Index is left at the keyword separator or at the end of the text. Return | |
* the start of the variant code in the buffer. | |
* | |
* In standard form, we can have the following forms: | |
* ll__VVVV | |
* ll_CC_VVVV | |
* ll_Ssss_VVVV | |
* ll_Ssss_CC_VVVV | |
* | |
* This also handles POSIX ids, which can have the following forms (pppp is code page id): | |
* ll_CC.pppp --> ll_CC | |
* ll_CC.pppp@VVVV --> ll_CC_VVVV | |
* ll_CC@VVVV --> ll_CC_VVVV | |
* | |
* We identify this use of '@' in POSIX ids by looking for an '=' following | |
* the '@'. If there is one, we consider '@' to start a keyword list, instead of | |
* being part of a POSIX id. | |
* | |
* Note: since it was decided that we want an option to not handle POSIX ids, this | |
* becomes a bit more complex. | |
*/ | |
private int parseVariant() { | |
int oldBlen = blen; | |
boolean start = true; | |
boolean needSeparator = true; | |
boolean skipping = false; | |
char c; | |
while ((c = next()) != DONE) { | |
if (c == DOT) { | |
start = false; | |
skipping = true; | |
} else if (c == KEYWORD_SEPARATOR) { | |
if (haveKeywordAssign()) { | |
break; | |
} | |
skipping = false; | |
start = false; | |
needSeparator = true; // add another underscore if we have more text | |
} else if (start) { | |
start = false; | |
} else if (!skipping) { | |
if (needSeparator) { | |
boolean incOldBlen = blen == oldBlen; // need to skip separators | |
needSeparator = false; | |
if (incOldBlen && !hadCountry) { // no country, we'll need two | |
addSeparator(); | |
++oldBlen; // for sure | |
} | |
addSeparator(); | |
if (incOldBlen) { // only for the first separator | |
++oldBlen; | |
} | |
} | |
c = Character.toUpperCase(c); | |
if (c == HYPHEN || c == COMMA) { | |
c = UNDERSCORE; | |
} | |
append(c); | |
} | |
} | |
--index; // unget | |
return oldBlen; | |
} | |
// no need for skipvariant, to get the keywords we'll just scan directly for | |
// the keyword separator | |
/** | |
* Returns the normalized language id, or the empty string. | |
*/ | |
public String getLanguage() { | |
reset(); | |
return getString(parseLanguage()); | |
} | |
/** | |
* Returns the normalized script id, or the empty string. | |
*/ | |
public String getScript() { | |
reset(); | |
skipLanguage(); | |
return getString(parseScript()); | |
} | |
/** | |
* return the normalized country id, or the empty string. | |
*/ | |
public String getCountry() { | |
reset(); | |
skipLanguage(); | |
skipScript(); | |
return getString(parseCountry()); | |
} | |
/** | |
* Returns the normalized variant id, or the empty string. | |
*/ | |
public String getVariant() { | |
reset(); | |
skipLanguage(); | |
skipScript(); | |
skipCountry(); | |
return getString(parseVariant()); | |
} | |
/** | |
* Returns the language, script, country, and variant as separate strings. | |
*/ | |
public String[] getLanguageScriptCountryVariant() { | |
reset(); | |
return new String[] { | |
getString(parseLanguage()), | |
getString(parseScript()), | |
getString(parseCountry()), | |
getString(parseVariant()) | |
}; | |
} | |
public void setBaseName(String baseName) { | |
this.baseName = baseName; | |
} | |
public void parseBaseName() { | |
if (baseName != null) { | |
set(0, baseName); | |
} else { | |
reset(); | |
parseLanguage(); | |
parseScript(); | |
parseCountry(); | |
parseVariant(); | |
// catch unwanted trailing underscore after country if there was no variant | |
if (blen > 1 && buffer[blen-1] == UNDERSCORE) { | |
--blen; | |
} | |
} | |
} | |
/** | |
* Returns the normalized base form of the locale id. The base | |
* form does not include keywords. | |
*/ | |
public String getBaseName() { | |
if (baseName != null) { | |
return baseName; | |
} | |
parseBaseName(); | |
return getString(0); | |
} | |
/** | |
* Returns the normalized full form of the locale id. The full | |
* form includes keywords if they are present. | |
*/ | |
public String getName() { | |
parseBaseName(); | |
parseKeywords(); | |
return getString(0); | |
} | |
// keyword utilities | |
/** | |
* If we have keywords, advance index to the start of the keywords and return true, | |
* otherwise return false. | |
*/ | |
private boolean setToKeywordStart() { | |
for (int i = index; i < id.length; ++i) { | |
if (id[i] == KEYWORD_SEPARATOR) { | |
if (canonicalize) { | |
for (int j = ++i; j < id.length; ++j) { // increment i past separator for return | |
if (id[j] == KEYWORD_ASSIGN) { | |
index = i; | |
return true; | |
} | |
} | |
} else { | |
if (++i < id.length) { | |
index = i; | |
return true; | |
} | |
} | |
break; | |
} | |
} | |
return false; | |
} | |
private static boolean isDoneOrKeywordAssign(char c) { | |
return c == DONE || c == KEYWORD_ASSIGN; | |
} | |
private static boolean isDoneOrItemSeparator(char c) { | |
return c == DONE || c == ITEM_SEPARATOR; | |
} | |
private String getKeyword() { | |
int start = index; | |
while (!isDoneOrKeywordAssign(next())) { | |
} | |
--index; | |
return AsciiUtil.toLowerString(new String(id, start, index-start).trim()); | |
} | |
private String getValue() { | |
int start = index; | |
while (!isDoneOrItemSeparator(next())) { | |
} | |
--index; | |
return new String(id, start, index-start).trim(); // leave case alone | |
} | |
private Comparator<String> getKeyComparator() { | |
final Comparator<String> comp = new Comparator<String>() { | |
public int compare(String lhs, String rhs) { | |
return lhs.compareTo(rhs); | |
} | |
}; | |
return comp; | |
} | |
/** | |
* Returns a map of the keywords and values, or null if there are none. | |
*/ | |
public Map<String, String> getKeywordMap() { | |
if (keywords == null) { | |
TreeMap<String, String> m = null; | |
if (setToKeywordStart()) { | |
// trim spaces and convert to lower case, both keywords and values. | |
do { | |
String key = getKeyword(); | |
if (key.length() == 0) { | |
break; | |
} | |
char c = next(); | |
if (c != KEYWORD_ASSIGN) { | |
// throw new IllegalArgumentException("key '" + key + "' missing a value."); | |
if (c == DONE) { | |
break; | |
} else { | |
continue; | |
} | |
} | |
String value = getValue(); | |
if (value.length() == 0) { | |
// throw new IllegalArgumentException("key '" + key + "' missing a value."); | |
continue; | |
} | |
if (m == null) { | |
m = new TreeMap<String, String>(getKeyComparator()); | |
} else if (m.containsKey(key)) { | |
// throw new IllegalArgumentException("key '" + key + "' already has a value."); | |
continue; | |
} | |
m.put(key, value); | |
} while (next() == ITEM_SEPARATOR); | |
} | |
keywords = m != null ? m : Collections.<String, String>emptyMap(); | |
} | |
return keywords; | |
} | |
/** | |
* Parse the keywords and return start of the string in the buffer. | |
*/ | |
private int parseKeywords() { | |
int oldBlen = blen; | |
Map<String, String> m = getKeywordMap(); | |
if (!m.isEmpty()) { | |
boolean first = true; | |
for (Map.Entry<String, String> e : m.entrySet()) { | |
append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR); | |
first = false; | |
append(e.getKey()); | |
append(KEYWORD_ASSIGN); | |
append(e.getValue()); | |
} | |
if (blen != oldBlen) { | |
++oldBlen; | |
} | |
} | |
return oldBlen; | |
} | |
/** | |
* Returns an iterator over the keywords, or null if we have an empty map. | |
*/ | |
public Iterator<String> getKeywords() { | |
Map<String, String> m = getKeywordMap(); | |
return m.isEmpty() ? null : m.keySet().iterator(); | |
} | |
/** | |
* Returns the value for the named keyword, or null if the keyword is not | |
* present. | |
*/ | |
public String getKeywordValue(String keywordName) { | |
Map<String, String> m = getKeywordMap(); | |
return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim())); | |
} | |
/** | |
* Set the keyword value only if it is not already set to something else. | |
*/ | |
public void defaultKeywordValue(String keywordName, String value) { | |
setKeywordValue(keywordName, value, false); | |
} | |
/** | |
* Set the value for the named keyword, or unset it if value is null. If | |
* keywordName itself is null, unset all keywords. If keywordName is not null, | |
* value must not be null. | |
*/ | |
public void setKeywordValue(String keywordName, String value) { | |
setKeywordValue(keywordName, value, true); | |
} | |
/** | |
* Set the value for the named keyword, or unset it if value is null. If | |
* keywordName itself is null, unset all keywords. If keywordName is not null, | |
* value must not be null. If reset is true, ignore any previous value for | |
* the keyword, otherwise do not change the keyword (including removal of | |
* one or all keywords). | |
*/ | |
private void setKeywordValue(String keywordName, String value, boolean reset) { | |
if (keywordName == null) { | |
if (reset) { | |
// force new map, ignore value | |
keywords = Collections.<String, String>emptyMap(); | |
} | |
} else { | |
keywordName = AsciiUtil.toLowerString(keywordName.trim()); | |
if (keywordName.length() == 0) { | |
throw new IllegalArgumentException("keyword must not be empty"); | |
} | |
if (value != null) { | |
value = value.trim(); | |
if (value.length() == 0) { | |
throw new IllegalArgumentException("value must not be empty"); | |
} | |
} | |
Map<String, String> m = getKeywordMap(); | |
if (m.isEmpty()) { // it is EMPTY_MAP | |
if (value != null) { | |
// force new map | |
keywords = new TreeMap<String, String>(getKeyComparator()); | |
keywords.put(keywordName, value.trim()); | |
} | |
} else { | |
if (reset || !m.containsKey(keywordName)) { | |
if (value != null) { | |
m.put(keywordName, value); | |
} else { | |
m.remove(keywordName); | |
if (m.isEmpty()) { | |
// force new map | |
keywords = Collections.<String, String>emptyMap(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} |