blob: 5c19e3159c368bd40137edd3655865722cfb7219 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.impl.locale;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import com.ibm.icu.impl.Utility;
public final class LanguageTag {
// ---- Parsed components. Strings are "" and collections/arrays are null when absent. ----
private String _languageTag = "";   // entire language tag, as passed to the constructor
private String _grandfathered = ""; // grandfathered tag
private String _privateuse = "";    // privateuse, not including leading "x-"
private String _language = "";      // language subtag
private String[] _extlang;          // array of extlang subtags
private String _script = "";        // script subtag
private String _region = "";        // region subtag
// private TreeSet<String> _variants;
private TreeSet _variants;          // sorted set of variant subtags (elements are String)
// private TreeSet<Extension> _extensions;
private TreeSet _extensions;        // extension key/value pairs (elements are Extension)

private static final int MINLEN = 2;          // minimum length of a valid language tag
private static final String SEP = "-";        // subtag separator, as a String
private static final char SEPCHAR = '-';      // subtag separator, as a char
private static final String PRIVATEUSE = "x"; // singleton that introduces private use subtags

// Language subtag emitted by toLanguageTag() when the locale has no usable language.
// Declared final so that no caller can reassign this shared constant.
public static final String UNDETERMINED = "und";
// Map contains grandfathered tags and their preferred mappings from
// http://www.ietf.org/internet-drafts/draft-ietf-ltru-4645bis-09.txt
// An empty preferred value leaves the language of the parsed tag unset (see parse()).
//
// Keys are stored in lower case: parse() lower-cases its input before looking it up
// here, so an entry written with upper case letters (for example "en-GB-oed" or
// "sgn-BE-FR") would otherwise never be found.
// private static final HashMap<String,String> GRANDFATHERED = new HashMap<String,String>();
private static final HashMap GRANDFATHERED = new HashMap();
static {
    // Entries are listed in their registered spelling; the key is normalized below.
    final String[][] entries = {
        //{"tag",         "preferred"},
        {"art-lojban",    "jbo"},
        {"cel-gaulish",   ""},
        {"en-GB-oed",     ""},
        {"i-ami",         "ami"},
        {"i-bnn",         "bnn"},
        {"i-default",     ""},
        {"i-enochian",    ""},
        {"i-hak",         "hak"},
        {"i-klingon",     "tlh"},
        {"i-lux",         "lb"},
        {"i-mingo",       ""},
        {"i-navajo",      "nv"},
        {"i-pwn",         "pwn"},
        {"i-tao",         "tao"},
        {"i-tay",         "tay"},
        {"i-tsu",         "tsu"},
        {"no-bok",        "nb"},
        {"no-nyn",        "nn"},
        {"sgn-BE-FR",     "sfb"},
        {"sgn-BE-NL",     "vgt"},
        {"sgn-CH-DE",     "sgg"},
        {"zh-guoyu",      "cmn"},
        {"zh-hakka",      "hak"},
        {"zh-min",        ""},
        {"zh-min-nan",    "nan"},
        {"zh-xiang",      "hsn"},
    };
    // for (String[] e : entries) {
    for (int i = 0; i < entries.length; i++) {
        String[] e = entries[i];
        GRANDFATHERED.put(AsciiUtil.toLowerString(e[0]), e[1]);
    }
}
// Language code pairs. Element [0] of each row is the deprecated code and
// element [1] is the code that replaces it.
private static final String[][] DEPRECATEDLANGS = new String[][] {
    { "iw", "he" },
    { "ji", "yi" },
    { "in", "id" },
};
// Private constructor: stores the given string as the entire tag and leaves
// every other component at its field default. Used by parse(String).
private LanguageTag(String tag) {
    this._languageTag = tag;
}
// Parser state flags. parse() keeps a bit mask of the subtag kinds that may
// legally appear next; each constant below is one bit of that mask.
private static final int LANG = 1 << 0; // language
private static final int EXTL = 1 << 1; // extlang
private static final int SCRT = 1 << 2; // script
private static final int REGN = 1 << 3; // region
private static final int VART = 1 << 4; // variant
private static final int EXTS = 1 << 5; // extension singleton
private static final int EXTV = 1 << 6; // extension value subtag
private static final int PRIV = 1 << 7; // private use ("x")
/**
 * Parses a language tag string into a LanguageTag.
 *
 * The input is lower-cased first. If the lower-cased tag is a key of GRANDFATHERED,
 * the result carries only the grandfathered tag plus, when the table holds a
 * non-empty preferred value, that value as its language. Otherwise the subtags are
 * consumed left to right in this order:
 *
 *   language, up to three extlang, script, region, any number of variants,
 *   any number of extensions (singleton followed by one or more values),
 *   private use ("x" followed by one or more values)
 *
 * A tag may also consist of the private use part alone.
 *
 * Extensions are kept in a set ordered by singleton only, so when the same
 * singleton appears twice the set keeps the first pair and ignores the later one.
 *
 * @param langtag the language tag to parse; must not be null
 * @return the parsed tag
 * @throws LocaleSyntaxException if the tag is shorter than MINLEN, ends with the
 *         separator, or contains a subtag that is not legal at its position. Where
 *         an index is supplied it is the offset, within the input, of the start of
 *         the offending subtag.
 */
public static LanguageTag parse(String langtag) throws LocaleSyntaxException {
    if (langtag.length() < MINLEN) {
        throw new LocaleSyntaxException("The specified tag '"
                + langtag + "' is too short");
    }

    if (langtag.endsWith(SEP)) {
        // A trailing separator is rejected up front rather than relying on how the
        // splitter treats a trailing empty segment. The index walks back from the
        // last character to just after the previous separator (or to 0).
        int erridx = langtag.length() - 1;
        while (erridx - 1 >= 0 && langtag.charAt(erridx - 1) != SEPCHAR) {
            erridx--;
        }
        throw new LocaleSyntaxException("The specified tag '"
                + langtag + "' ends with " + SEP, erridx);
    }

    String tag = AsciiUtil.toLowerString(langtag);
    LanguageTag t = new LanguageTag(tag);

    // Check if the tag is grandfathered
    if (GRANDFATHERED.containsKey(tag)) {
        t._grandfathered = tag;
        // Preferred mapping
        // String preferred = GRANDFATHERED.get(tag);
        String preferred = (String)GRANDFATHERED.get(tag);
        if (preferred.length() > 0) {
            t._language = preferred;
        }
        return t;
    }

    // String[] subtags = tag.split(SEP);
    String[] subtags = Utility.split(tag, '-');
    int idx = 0;                 // index of the subtag being examined
    int extlangIdx = 0;          // number of extlang subtags stored so far
    String extSingleton = null;  // singleton of the extension being collected
    int extSingletonIdx = -1;    // subtag index of that singleton, for error reporting
    // StringBuilder extBuf = null;
    StringBuffer extBuf = null;  // value of the extension being collected
    int next = LANG | PRIV;      // subtag kinds that are legal at this position
    String errorMsg = null;

    PARSE:
    while (idx < subtags.length) {
        if ((next & LANG) != 0 && isLanguageSubtag(subtags[idx])) {
            t._language = subtags[idx++];
            next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
            continue;
        }
        if ((next & EXTL) != 0 && isExtlangSubtag(subtags[idx])) {
            if (extlangIdx == 0) {
                t._extlang = new String[3];
            }
            t._extlang[extlangIdx++] = subtags[idx++];
            if (extlangIdx < 3) {
                next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
            } else {
                // All three slots are used; no further extlang is accepted.
                next = SCRT | REGN | VART | EXTS | PRIV;
            }
            continue;
        }
        if ((next & SCRT) != 0 && isScriptSubtag(subtags[idx])) {
            t._script = subtags[idx++];
            next = REGN | VART | EXTS | PRIV;
            continue;
        }
        if ((next & REGN) != 0 && isRegionSubtag(subtags[idx])) {
            t._region = subtags[idx++];
            next = VART | EXTS | PRIV;
            continue;
        }
        if ((next & VART) != 0 && isVariantSubtag(subtags[idx])) {
            if (t._variants == null) {
                // t._variants = new TreeSet<String>();
                t._variants = new TreeSet();
            }
            t._variants.add(subtags[idx++]);
            next = VART | EXTS | PRIV;
            continue;
        }
        if ((next & EXTS) != 0 && isExtensionSingleton(subtags[idx])) {
            if (extSingleton != null) {
                if (extBuf == null) {
                    errorMsg = "The specified tag '"
                            + langtag + "' contains an incomplete extension: "
                            + extSingleton;
                    break PARSE;
                }
                // Emit the previous extension key/value pair
                addExtension(t, extSingleton, extBuf.toString());
            }
            extSingletonIdx = idx;
            extSingleton = subtags[idx++];
            extBuf = null; // Clear the extension value buffer
            next = EXTV;   // a singleton must be followed by at least one value
            continue;
        }
        if ((next & EXTV) != 0 && isExtensionSubtag(subtags[idx])) {
            if (extBuf == null) {
                // extBuf = new StringBuilder(subtags[idx++]);
                extBuf = new StringBuffer(subtags[idx++]);
            } else {
                extBuf.append(SEP);
                extBuf.append(subtags[idx++]);
            }
            next = EXTS | EXTV | PRIV;
            continue;
        }
        if ((next & PRIV) != 0 && AsciiUtil.caseIgnoreMatch(PRIVATEUSE, subtags[idx])) {
            // The rest of part will be private use value subtags
            // StringBuilder puBuf = new StringBuilder();
            StringBuffer puBuf = new StringBuffer();
            int privateuseIdx = idx++;
            for (boolean bFirst = true; idx < subtags.length; idx++) {
                if (!isPrivateuseValueSubtag(subtags[idx])) {
                    errorMsg = "The specified tag '"
                            + langtag + "' contains an illegal private use subtag: "
                            + (subtags[idx].length() == 0 ? "<empty>" : subtags[idx]);
                    break PARSE;
                }
                if (bFirst) {
                    bFirst = false;
                } else {
                    puBuf.append(SEP);
                }
                puBuf.append(subtags[idx]);
            }
            t._privateuse = puBuf.toString();
            if (t._privateuse.length() == 0) {
                // Empty privateuse value: "x" was the last subtag. Report the
                // position of that "x" instead of an offset past the end of the input.
                idx = privateuseIdx;
                errorMsg = "The specified tag '"
                        + langtag + "' contains an empty private use subtag";
                break PARSE;
            }
            break;
        }
        // If we fell through here, it means this subtag is illegal
        errorMsg = "The specified tag '" + langtag
                + "' contains an illegal subtag: "
                + (subtags[idx].length() == 0 ? "<empty>" : subtags[idx]);
        break PARSE;
    }

    if (errorMsg == null && extSingleton != null) {
        if (extBuf == null) {
            // Extension singleton without following extension value. Report the
            // position of the singleton instead of an offset past the end of the input.
            errorMsg = "The specified tag '"
                    + langtag + "' contains an incomplete extension: "
                    + extSingleton;
            idx = extSingletonIdx;
        } else {
            // Emit the last extension key/value pair
            addExtension(t, extSingleton, extBuf.toString());
        }
    }

    if (errorMsg != null) {
        // Restore the original string index: every subtag before idx contributes
        // its own length plus one separator.
        int errIndex = 0;
        for (int i = 0; i < idx; i++) {
            errIndex += (subtags[i].length() + 1);
        }
        throw new LocaleSyntaxException(errorMsg, errIndex);
    }
    return t;
}

// Adds one extension singleton/value pair to the tag, creating the set on first use.
private static void addExtension(LanguageTag t, String singleton, String value) {
    if (t._extensions == null) {
        // t._extensions = new TreeSet<Extension>();
        t._extensions = new TreeSet();
    }
    t._extensions.add(new Extension(singleton.charAt(0), value));
}
/**
 * Returns the entire tag string held by this object. For objects produced by
 * parse(String) this is the lower-cased input.
 */
public String getTag() {
    return this._languageTag;
}
/**
 * Returns the language subtag, or the empty string when none was set.
 */
public String getLanguage() {
    return this._language;
}
/**
 * Returns the language subtag, replaced by its deprecated code when the language
 * matches (ignoring case) the current code of a DEPRECATEDLANGS entry.
 * Any other language is returned unchanged.
 */
public String getJDKLanguage() {
    // for (String[] langMap : DEPRECATEDLANGS) {
    for (int i = 0; i < DEPRECATEDLANGS.length; i++) {
        String deprecatedCode = DEPRECATEDLANGS[i][0];
        String currentCode = DEPRECATEDLANGS[i][1];
        if (AsciiUtil.caseIgnoreCompare(_language, currentCode) == 0) {
            // use the old code
            return deprecatedCode;
        }
    }
    return _language;
}
/**
 * Returns the extlang subtag stored at the given position.
 *
 * @param idx zero-based position of the extlang subtag
 * @return the subtag, or null when this tag has no extlang subtags, when idx lies
 *         outside the extlang array (including negative values), or when that
 *         slot was never filled
 */
public String getExtlang(int idx) {
    // Negative indices are treated like any other out-of-range index, so the
    // result does not depend on whether the tag happens to carry extlang subtags.
    if (_extlang != null && idx >= 0 && idx < _extlang.length) {
        return _extlang[idx];
    }
    return null;
}
/**
 * Returns the script subtag, or the empty string when none was set.
 */
public String getScript() {
    return this._script;
}
/**
 * Returns the region subtag, or the empty string when none was set.
 */
public String getRegion() {
    return this._region;
}
/**
 * Returns all variant subtags joined with the subtag separator, in the iteration
 * order of the underlying sorted set, or the empty string when there are none.
 */
public String getVariant() {
    if (_variants == null) {
        return "";
    }
    // StringBuilder joined = new StringBuilder();
    StringBuffer joined = new StringBuffer();
    // Iterator<String> it = _variants.iterator();
    for (Iterator it = _variants.iterator(); it.hasNext();) {
        // A separator goes in front of every element once something has been written.
        if (joined.length() > 0) {
            joined.append(SEP);
        }
        joined.append(it.next());
    }
    return joined.toString();
}
/**
 * Returns a read-only view of the variant subtags.
 *
 * The method name is misspelled ("Varinats"); it is kept as is because renaming
 * it would break existing callers.
 *
 * @return an unmodifiable set of the variant subtags, or an empty set when this
 *         tag has none. (Previously a tag without variants caused a
 *         NullPointerException here, because the backing set is created lazily.)
 */
// public Set<String> getVarinats() {
public Set getVarinats() {
    if (_variants == null) {
        return Collections.EMPTY_SET;
    }
    return Collections.unmodifiableSet(_variants);
}
/**
 * Returns a read-only view of the extension key/value pairs.
 *
 * @return an unmodifiable set of Extension objects, or null when this tag has
 *         no extensions
 */
// public Set<Extension> getExtensions() {
public Set getExtensions() {
    return (_extensions == null) ? null : Collections.unmodifiableSet(_extensions);
}
/**
 * Returns the private use value without its leading "x-", or the empty string
 * when none was set.
 */
public String getPrivateUse() {
    return this._privateuse;
}
/**
 * Returns the grandfathered tag, or the empty string when this tag is not a
 * grandfathered one.
 */
public String getGrandfathered() {
    return this._grandfathered;
}
/**
 * Tells whether the string has the form of a language subtag: 2 to 8 characters,
 * all accepted by AsciiUtil.isAlphaString.
 *
 *   language = 2*3ALPHA        ; shortest ISO 639 code
 *              ["-" extlang]   ; sometimes followed by extended language subtags
 *            / 4ALPHA          ; or reserved for future use
 *            / 5*8ALPHA        ; or registered language subtag
 */
public static boolean isLanguageSubtag(String s) {
    int len = s.length();
    if (len < 2 || len > 8) {
        return false;
    }
    return AsciiUtil.isAlphaString(s);
}
/**
 * Tells whether the string has the form of an extlang subtag: exactly 3
 * characters, all accepted by AsciiUtil.isAlphaString.
 *
 *   extlang = 3ALPHA           ; selected ISO 639 codes
 *             *2("-" 3ALPHA)   ; permanently reserved
 */
public static boolean isExtlangSubtag(String s) {
    if (s.length() != 3) {
        return false;
    }
    return AsciiUtil.isAlphaString(s);
}
/**
 * Tells whether the string has the form of a script subtag: exactly 4
 * characters, all accepted by AsciiUtil.isAlphaString.
 *
 *   script = 4ALPHA            ; ISO 15924 code
 */
public static boolean isScriptSubtag(String s) {
    if (s.length() != 4) {
        return false;
    }
    return AsciiUtil.isAlphaString(s);
}
/**
 * Tells whether the string has the form of a region subtag: either 2 characters
 * accepted by AsciiUtil.isAlphaString or 3 characters accepted by
 * AsciiUtil.isNumericString.
 *
 *   region = 2ALPHA            ; ISO 3166-1 code
 *          / 3DIGIT            ; UN M.49 code
 */
public static boolean isRegionSubtag(String s) {
    int len = s.length();
    if (len == 2) {
        return AsciiUtil.isAlphaString(s);
    }
    if (len == 3) {
        return AsciiUtil.isNumericString(s);
    }
    return false;
}
/**
 * Tells whether the string has the form of a variant subtag: either 5 to 8
 * alphanumeric characters, or 4 characters of which the first is a digit and
 * the remaining three are alphanumeric.
 *
 *   variant = 5*8alphanum      ; registered variants
 *           / (DIGIT 3alphanum)
 */
public static boolean isVariantSubtag(String s) {
    int len = s.length();
    if (len == 4) {
        // Four-character form: a leading digit followed by three alphanumerics.
        if (!AsciiUtil.isNumeric(s.charAt(0))) {
            return false;
        }
        for (int i = 1; i < 4; i++) {
            if (!AsciiUtil.isAlphaNumeric(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }
    return len >= 5 && len <= 8 && AsciiUtil.isAlphaNumericString(s);
}
/**
 * Tells whether the string can introduce an extension: a single character that
 * is accepted by AsciiUtil.isAlphaString and is not the private use singleton
 * "x" (compared ignoring case).
 *
 *   extension = singleton 1*("-" (2*8alphanum))
 *
 * NOTE(review): RFC 5646 defines singleton as a DIGIT or any letter other than
 * "x"; a digit is rejected here. Confirm whether that restriction is intended.
 */
public static boolean isExtensionSingleton(String s) {
    if (s.length() != 1) {
        return false;
    }
    if (!AsciiUtil.isAlphaString(s)) {
        return false;
    }
    return !AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
}
/**
 * Tells whether the string has the form of an extension value subtag: 2 to 8
 * characters, all accepted by AsciiUtil.isAlphaNumericString.
 *
 *   extension = singleton 1*("-" (2*8alphanum))
 */
public static boolean isExtensionSubtag(String s) {
    int len = s.length();
    if (len < 2 || len > 8) {
        return false;
    }
    return AsciiUtil.isAlphaNumericString(s);
}
/**
 * Tells whether the string has the form of a private use value subtag: 1 to 8
 * characters, all accepted by AsciiUtil.isAlphaNumericString.
 *
 *   privateuse = "x" 1*("-" (1*8alphanum))
 */
public static boolean isPrivateuseValueSubtag(String s) {
    int len = s.length();
    if (len < 1 || len > 8) {
        return false;
    }
    return AsciiUtil.isAlphaNumericString(s);
}
/*
 * Holder for one language tag extension: a singleton character and the value
 * that follows it. Ordering is by singleton only, so two extensions with the
 * same singleton compare as equal regardless of their values.
 */
// public static class Extension implements Comparable<Extension> {
public static class Extension implements Comparable {
    private char _singleton;
    private String _value;

    /*
     * Creates an extension. The singleton is stored after conversion with
     * AsciiUtil.toLower; the value is stored as given.
     */
    public Extension(char singleton, String value) {
        this._singleton = AsciiUtil.toLower(singleton);
        this._value = value;
    }

    /* Returns the stored singleton character. */
    public char getSingleton() {
        return this._singleton;
    }

    /* Returns the extension value. */
    public String getValue() {
        return this._value;
    }

    /*
     * Compares by singleton character: negative, zero or positive when this
     * singleton is less than, equal to or greater than the other one.
     */
    public int compareTo(Extension other) {
        return this._singleton - other._singleton;
    }

    /*
     * Comparable entry point for raw (pre-generics) callers. Throws
     * ClassCastException when the argument is not an Extension.
     */
    public int compareTo(Object obj) {
        Extension other = (Extension) obj;
        return compareTo(other);
    }
}
/**
 * Builds a language tag string from locale fields and extensions.
 *
 * Each part is appended only when it has the form of the corresponding subtag:
 * <ul>
 * <li>language: a deprecated code listed in DEPRECATEDLANGS is replaced by its
 *     current code; a language that is empty or not a language subtag becomes
 *     UNDETERMINED</li>
 * <li>script and region: appended in lower case, skipped when not well-formed</li>
 * <li>variant: split on '_'; each well-formed piece is appended in lower case
 *     and the others are skipped</li>
 * <li>extensions: the canonical string of ext is appended when ext is not null
 *     and that string is not empty</li>
 * </ul>
 *
 * @param base supplies language, script, region and variant; must not be null
 * @param ext  supplies the extension part (including private use); may be null
 * @return the assembled language tag
 */
public static String toLanguageTag(BaseLocale base, LocaleExtensions ext) {
    // StringBuilder buf = new StringBuilder();
    StringBuffer buf = new StringBuffer();

    // language - isLanguageSubtag() already rejects the empty string, so a
    // single test covers both the "missing" and the "malformed" case.
    String language = base.getLanguage();
    if (isLanguageSubtag(language)) {
        // if deprecated language code, map to the current one
        // for (String[] langMap : DEPRECATEDLANGS) {
        for (int i = 0; i < DEPRECATEDLANGS.length; i++) {
            String[] langMap = DEPRECATEDLANGS[i];
            if (AsciiUtil.caseIgnoreCompare(language, langMap[0]) == 0) {
                language = langMap[1];
                break;
            }
        }
        buf.append(language);
    } else {
        buf.append(UNDETERMINED);
    }

    // script
    String script = base.getScript();
    if (isScriptSubtag(script)) {
        buf.append(SEP);
        buf.append(AsciiUtil.toLowerString(script));
    }

    // region
    String region = base.getRegion();
    if (isRegionSubtag(region)) {
        buf.append(SEP);
        buf.append(AsciiUtil.toLowerString(region));
    }

    // variant
    String variant = base.getVariant();
    if (variant.length() > 0) {
        // String[] variants = variant.split("_");
        String[] variants = Utility.split(variant, '_');
        // for (String var : variants) {
        for (int i = 0; i < variants.length; i++) {
            String var = variants[i];
            if (isVariantSubtag(var)) {
                buf.append(SEP);
                buf.append(AsciiUtil.toLowerString(var));
            }
        }
    }

    // extensions including private use
    if (ext != null) {
        // Fetch the canonical string once and reuse it for the test and the append.
        String exttags = ext.getCanonicalString();
        if (exttags.length() > 0) {
            buf.append(SEP);
            buf.append(exttags);
        }
    }
    return buf.toString();
}
}