blob: 0d1c259b9745c723739a50c9500e46fec4406d12 [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl;
import com.ibm.icu.util.ULocale;
public final class CaseMap {
/**
* Implementation of UCaseProps.ContextIterator, iterates over a String.
* See ustrcase.c/utf16_caseContextIterator().
*/
public static final class StringContextIterator implements UCaseProps.ContextIterator {
/**
* Constructor.
* @param s String to iterate over.
*/
public StringContextIterator(String s) {
this.s=s;
limit=s.length();
cpStart=cpLimit=index=0;
dir=0;
}
/**
* Set the iteration limit for nextCaseMapCP() to an index within the string.
* If the limit parameter is negative or past the string, then the
* string length is restored as the iteration limit.
*
* <p>This limit does not affect the next() function which always
* iterates to the very end of the string.
*
* @param lim The iteration limit.
*/
public void setLimit(int lim) {
if(0<=lim && lim<=s.length()) {
limit=lim;
} else {
limit=s.length();
}
}
/**
* Move to the iteration limit without fetching code points up to there.
*/
public void moveToLimit() {
cpStart=cpLimit=limit;
}
/**
* Iterate forward through the string to fetch the next code point
* to be case-mapped, and set the context indexes for it.
*
* <p>When the iteration limit is reached (and -1 is returned),
* getCPStart() will be at the iteration limit.
*
* <p>Iteration with next() does not affect the position for nextCaseMapCP().
*
* @return The next code point to be case-mapped, or <0 when the iteration is done.
*/
public int nextCaseMapCP() {
cpStart=cpLimit;
if(cpLimit<limit) {
int c=s.codePointAt(cpLimit);
cpLimit+=Character.charCount(c);
return c;
} else {
return -1;
}
}
/**
* Returns the start of the code point that was last returned
* by nextCaseMapCP().
*/
public int getCPStart() {
return cpStart;
}
/**
* Returns the limit of the code point that was last returned
* by nextCaseMapCP().
*/
public int getCPLimit() {
return cpLimit;
}
// implement UCaseProps.ContextIterator
// The following code is not used anywhere in this private class
@Override
public void reset(int direction) {
if(direction>0) {
/* reset for forward iteration */
dir=1;
index=cpLimit;
} else if(direction<0) {
/* reset for backward iteration */
dir=-1;
index=cpStart;
} else {
// not a valid direction
dir=0;
index=0;
}
}
@Override
public int next() {
int c;
if(dir>0 && index<s.length()) {
c=s.codePointAt(index);
index+=Character.charCount(c);
return c;
} else if(dir<0 && index>0) {
c=s.codePointBefore(index);
index-=Character.charCount(c);
return c;
}
return -1;
}
// variables
protected String s;
protected int index, limit, cpStart, cpLimit;
protected int dir; // 0=initial state >0=forward <0=backward
}
/** Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. */
private static final void appendResult(int c, StringBuilder result) {
// Decode the result.
if (c < 0) {
// (not) original code point
result.appendCodePoint(~c);
} else if (c <= UCaseProps.MAX_STRING_LENGTH) {
// The mapping has already been appended to result.
} else {
// Append the single-code point mapping.
result.appendCodePoint(c);
}
}
// TODO: Move the other string case mapping functions from UCharacter to here, too.
public static String toUpper(ULocale locale, String str) {
if (locale == null) {
locale = ULocale.getDefault();
}
int[] locCache = new int[] { UCaseProps.getCaseLocale(locale, null) };
if (locCache[0] == UCaseProps.LOC_GREEK) {
return GreekUpper.toUpper(str, locCache);
}
StringContextIterator iter = new StringContextIterator(str);
StringBuilder result = new StringBuilder(str.length());
int c;
while((c=iter.nextCaseMapCP())>=0) {
c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, locale, locCache);
appendResult(c, result);
}
return result.toString();
}
private static final class GreekUpper {
// Data bits.
private static final int UPPER_MASK = 0x3ff;
private static final int HAS_VOWEL = 0x1000;
private static final int HAS_YPOGEGRAMMENI = 0x2000;
private static final int HAS_ACCENT = 0x4000;
private static final int HAS_DIALYTIKA = 0x8000;
// Further bits during data building and processing, not stored in the data map.
private static final int HAS_COMBINING_DIALYTIKA = 0x10000;
private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000;
private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
// State bits.
private static final int AFTER_CASED = 1;
private static final int AFTER_VOWEL_WITH_ACCENT = 2;
// Data generated by prototype code, see
// http://site.icu-project.org/design/case/greek-upper
// TODO: Move this data into ucase.icu.
private static final char[] data0370 = {
// U+0370..03FF
0x0370, // Ͱ
0x0370, // ͱ
0x0372, // Ͳ
0x0372, // ͳ
0,
0,
0x0376, // Ͷ
0x0376, // ͷ
0,
0,
0x037A, // ͺ
0x03FD, // ͻ
0x03FE, // ͼ
0x03FF, // ͽ
0,
0x037F, // Ϳ
0,
0,
0,
0,
0,
0,
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά
0,
0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί
0,
0x039F | HAS_VOWEL | HAS_ACCENT, // Ό
0,
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ
0x0391 | HAS_VOWEL, // Α
0x0392, // Β
0x0393, // Γ
0x0394, // Δ
0x0395 | HAS_VOWEL, // Ε
0x0396, // Ζ
0x0397 | HAS_VOWEL, // Η
0x0398, // Θ
0x0399 | HAS_VOWEL, // Ι
0x039A, // Κ
0x039B, // Λ
0x039C, // Μ
0x039D, // Ν
0x039E, // Ξ
0x039F | HAS_VOWEL, // Ο
0x03A0, // Π
0x03A1, // Ρ
0,
0x03A3, // Σ
0x03A4, // Τ
0x03A5 | HAS_VOWEL, // Υ
0x03A6, // Φ
0x03A7, // Χ
0x03A8, // Ψ
0x03A9 | HAS_VOWEL, // Ω
0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // Ϊ
0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // Ϋ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ά
0x0395 | HAS_VOWEL | HAS_ACCENT, // έ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ή
0x0399 | HAS_VOWEL | HAS_ACCENT, // ί
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ
0x0391 | HAS_VOWEL, // α
0x0392, // β
0x0393, // γ
0x0394, // δ
0x0395 | HAS_VOWEL, // ε
0x0396, // ζ
0x0397 | HAS_VOWEL, // η
0x0398, // θ
0x0399 | HAS_VOWEL, // ι
0x039A, // κ
0x039B, // λ
0x039C, // μ
0x039D, // ν
0x039E, // ξ
0x039F | HAS_VOWEL, // ο
0x03A0, // π
0x03A1, // ρ
0x03A3, // ς
0x03A3, // σ
0x03A4, // τ
0x03A5 | HAS_VOWEL, // υ
0x03A6, // φ
0x03A7, // χ
0x03A8, // ψ
0x03A9 | HAS_VOWEL, // ω
0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // ϊ
0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // ϋ
0x039F | HAS_VOWEL | HAS_ACCENT, // ό
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ
0x03CF, // Ϗ
0x0392, // ϐ
0x0398, // ϑ
0x03D2, // ϒ
0x03D2 | HAS_ACCENT, // ϓ
0x03D2 | HAS_DIALYTIKA, // ϔ
0x03A6, // ϕ
0x03A0, // ϖ
0x03CF, // ϗ
0x03D8, // Ϙ
0x03D8, // ϙ
0x03DA, // Ϛ
0x03DA, // ϛ
0x03DC, // Ϝ
0x03DC, // ϝ
0x03DE, // Ϟ
0x03DE, // ϟ
0x03E0, // Ϡ
0x03E0, // ϡ
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0x039A, // ϰ
0x03A1, // ϱ
0x03F9, // ϲ
0x037F, // ϳ
0x03F4, // ϴ
0x0395 | HAS_VOWEL, // ϵ
0,
0x03F7, // Ϸ
0x03F7, // ϸ
0x03F9, // Ϲ
0x03FA, // Ϻ
0x03FA, // ϻ
0x03FC, // ϼ
0x03FD, // Ͻ
0x03FE, // Ͼ
0x03FF, // Ͽ
};
private static final char[] data1F00 = {
// U+1F00..1FFF
0x0391 | HAS_VOWEL, // ἀ
0x0391 | HAS_VOWEL, // ἁ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἂ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἃ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἄ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἅ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἆ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἇ
0x0391 | HAS_VOWEL, // Ἀ
0x0391 | HAS_VOWEL, // Ἁ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἂ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἃ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἄ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἅ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἆ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἇ
0x0395 | HAS_VOWEL, // ἐ
0x0395 | HAS_VOWEL, // ἑ
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἒ
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἓ
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἔ
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἕ
0,
0,
0x0395 | HAS_VOWEL, // Ἐ
0x0395 | HAS_VOWEL, // Ἑ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἒ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἓ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἔ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἕ
0,
0,
0x0397 | HAS_VOWEL, // ἠ
0x0397 | HAS_VOWEL, // ἡ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἢ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἣ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἤ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἥ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἦ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἧ
0x0397 | HAS_VOWEL, // Ἠ
0x0397 | HAS_VOWEL, // Ἡ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἢ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἣ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἤ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἥ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἦ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἧ
0x0399 | HAS_VOWEL, // ἰ
0x0399 | HAS_VOWEL, // ἱ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἲ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἳ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἴ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἵ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἶ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἷ
0x0399 | HAS_VOWEL, // Ἰ
0x0399 | HAS_VOWEL, // Ἱ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἲ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἳ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἴ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἵ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἶ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἷ
0x039F | HAS_VOWEL, // ὀ
0x039F | HAS_VOWEL, // ὁ
0x039F | HAS_VOWEL | HAS_ACCENT, // ὂ
0x039F | HAS_VOWEL | HAS_ACCENT, // ὃ
0x039F | HAS_VOWEL | HAS_ACCENT, // ὄ
0x039F | HAS_VOWEL | HAS_ACCENT, // ὅ
0,
0,
0x039F | HAS_VOWEL, // Ὀ
0x039F | HAS_VOWEL, // Ὁ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὂ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὃ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὄ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὅ
0,
0,
0x03A5 | HAS_VOWEL, // ὐ
0x03A5 | HAS_VOWEL, // ὑ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὒ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὓ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὔ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὕ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὖ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὗ
0,
0x03A5 | HAS_VOWEL, // Ὑ
0,
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὓ
0,
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὕ
0,
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὗ
0x03A9 | HAS_VOWEL, // ὠ
0x03A9 | HAS_VOWEL, // ὡ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὢ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὣ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὤ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὥ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὦ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὧ
0x03A9 | HAS_VOWEL, // Ὠ
0x03A9 | HAS_VOWEL, // Ὡ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὢ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὣ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὤ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὥ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὦ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὧ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ὰ
0x0391 | HAS_VOWEL | HAS_ACCENT, // ά
0x0395 | HAS_VOWEL | HAS_ACCENT, // ὲ
0x0395 | HAS_VOWEL | HAS_ACCENT, // έ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ὴ
0x0397 | HAS_VOWEL | HAS_ACCENT, // ή
0x0399 | HAS_VOWEL | HAS_ACCENT, // ὶ
0x0399 | HAS_VOWEL | HAS_ACCENT, // ί
0x039F | HAS_VOWEL | HAS_ACCENT, // ὸ
0x039F | HAS_VOWEL | HAS_ACCENT, // ό
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὺ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὼ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ
0,
0,
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾀ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾁ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾂ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾃ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾄ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾅ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾆ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾇ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾈ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾉ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾊ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾋ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾌ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾍ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾎ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾏ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾐ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾑ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾒ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾓ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾔ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾕ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾖ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾗ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾘ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾙ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾚ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾛ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾜ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾝ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾞ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾟ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾠ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾡ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾢ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾣ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾤ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾥ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾦ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾧ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾨ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾩ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾪ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾫ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾬ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾭ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾮ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾯ
0x0391 | HAS_VOWEL, // ᾰ
0x0391 | HAS_VOWEL, // ᾱ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾲ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾳ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾴ
0,
0x0391 | HAS_VOWEL | HAS_ACCENT, // ᾶ
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾷ
0x0391 | HAS_VOWEL, // Ᾰ
0x0391 | HAS_VOWEL, // Ᾱ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ὰ
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾼ
0,
0x0399 | HAS_VOWEL, // ι
0,
0,
0,
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῂ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῃ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῄ
0,
0x0397 | HAS_VOWEL | HAS_ACCENT, // ῆ
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῇ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ὲ
0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ὴ
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῌ
0,
0,
0,
0x0399 | HAS_VOWEL, // ῐ
0x0399 | HAS_VOWEL, // ῑ
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῒ
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ
0,
0,
0x0399 | HAS_VOWEL | HAS_ACCENT, // ῖ
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῗ
0x0399 | HAS_VOWEL, // Ῐ
0x0399 | HAS_VOWEL, // Ῑ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ὶ
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί
0,
0,
0,
0,
0x03A5 | HAS_VOWEL, // ῠ
0x03A5 | HAS_VOWEL, // ῡ
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῢ
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ
0x03A1, // ῤ
0x03A1, // ῥ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ῦ
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῧ
0x03A5 | HAS_VOWEL, // Ῠ
0x03A5 | HAS_VOWEL, // Ῡ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὺ
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ
0x03A1, // Ῥ
0,
0,
0,
0,
0,
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῲ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῳ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῴ
0,
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ῶ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῷ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὸ
0x039F | HAS_VOWEL | HAS_ACCENT, // Ό
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὼ
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῼ
0,
0,
0,
};
// U+2126 Ohm sign
private static final char data2126 = 0x03A9 | HAS_VOWEL; // Ω
private static final int getLetterData(int c) {
if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
return 0;
} else if (c <= 0x3ff) {
return data0370[c - 0x370];
} else if (c <= 0x1fff) {
return data1F00[c - 0x1f00];
} else if (c == 0x2126) {
return data2126;
} else {
return 0;
}
}
/**
* Returns a non-zero value for each of the Greek combining diacritics
* listed in The Unicode Standard, version 8, chapter 7.2 Greek,
* plus some perispomeni look-alikes.
*/
private static final int getDiacriticData(int c) {
switch (c) {
case '\u0300': // varia
case '\u0301': // tonos = oxia
case '\u0342': // perispomeni
case '\u0302': // circumflex can look like perispomeni
case '\u0303': // tilde can look like perispomeni
case '\u0311': // inverted breve can look like perispomeni
return HAS_ACCENT;
case '\u0308': // dialytika = diaeresis
return HAS_COMBINING_DIALYTIKA;
case '\u0344': // dialytika tonos
return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
case '\u0345': // ypogegrammeni = iota subscript
return HAS_YPOGEGRAMMENI;
case '\u0304': // macron
case '\u0306': // breve
case '\u0313': // comma above
case '\u0314': // reversed comma above
case '\u0343': // koronis
return HAS_OTHER_GREEK_DIACRITIC;
default:
return 0;
}
}
private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
while (i < s.length()) {
int c = Character.codePointAt(s, i);
int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
if ((type & UCaseProps.IGNORABLE) != 0) {
// Case-ignorable, continue with the loop.
} else if (type != UCaseProps.NONE) {
return true; // Followed by cased letter.
} else {
return false; // Uncased and not case-ignorable.
}
}
return false; // Not followed by cased letter.
}
/**
* Greek string uppercasing with a state machine.
* Probably simpler than a stateless function that has to figure out complex context-before
* for each character.
* TODO: Try to re-consolidate one way or another with the non-Greek function.
*
* <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
*/
private static String toUpper(CharSequence s, int[] locCache) {
StringBuilder result = new StringBuilder(s.length());
int state = 0;
for (int i = 0; i < s.length();) {
int c = Character.codePointAt(s, i);
int nextIndex = i + Character.charCount(c);
int nextState = 0;
int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
if ((type & UCaseProps.IGNORABLE) != 0) {
// c is case-ignorable
nextState |= (state & AFTER_CASED);
} else if (type != UCaseProps.NONE) {
// c is cased
nextState |= AFTER_CASED;
}
int data = getLetterData(c);
if (data > 0) {
int upper = data & UPPER_MASK;
// Add a dialytika to this iota or ypsilon vowel
// if we removed a tonos from the previous vowel,
// and that previous vowel did not also have (or gain) a dialytika.
// Adding one only to the final vowel in a longer sequence
// (which does not occur in normal writing) would require lookahead.
// Set the same flag as for preserving an existing dialytika.
if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
(upper == 'Ι' || upper == 'Υ')) {
data |= HAS_DIALYTIKA;
}
int numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
if ((data & HAS_YPOGEGRAMMENI) != 0) {
numYpogegrammeni = 1;
}
// Skip combining diacritics after this Greek letter.
while (nextIndex < s.length()) {
int diacriticData = getDiacriticData(s.charAt(nextIndex));
if (diacriticData != 0) {
data |= diacriticData;
if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
++numYpogegrammeni;
}
++nextIndex;
} else {
break; // not a Greek diacritic
}
}
if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
nextState |= AFTER_VOWEL_WITH_ACCENT;
}
// Map according to Greek rules.
boolean addTonos = false;
if (upper == 'Η' &&
(data & HAS_ACCENT) != 0 &&
numYpogegrammeni == 0 &&
(state & AFTER_CASED) == 0 &&
!isFollowedByCasedLetter(s, nextIndex)) {
// Keep disjunctive "or" with (only) a tonos.
// We use the same "word boundary" conditions as for the Final_Sigma test.
if (i == nextIndex) {
upper = 'Ή'; // Preserve the precomposed form.
} else {
addTonos = true;
}
} else if ((data & HAS_DIALYTIKA) != 0) {
// Preserve a vowel with dialytika in precomposed form if it exists.
if (upper == 'Ι') {
upper = 'Ϊ';
data &= ~HAS_EITHER_DIALYTIKA;
} else if (upper == 'Υ') {
upper = 'Ϋ';
data &= ~HAS_EITHER_DIALYTIKA;
}
}
result.appendCodePoint(upper);
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
result.append('\u0308'); // restore or add a dialytika
}
if (addTonos) {
result.append('\u0301');
}
while (numYpogegrammeni > 0) {
result.append('Ι');
--numYpogegrammeni;
}
} else {
c = UCaseProps.INSTANCE.toFullUpper(c, null, result, null, locCache);
appendResult(c, result);
}
i = nextIndex;
state = nextState;
}
return result.toString();
}
}
}