blob: 78c925b24dc3fb9aed4731188e333dac03d4a12d [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 1996-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.lang;
import java.util.BitSet;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.lang.UScript.ScriptUsage;
import com.ibm.icu.text.UnicodeSet;
@RunWith(JUnit4.class)
public class TestUScript extends TestFmwk {
/**
* Constructor
*/
public TestUScript()
{
}
@Test
public void TestGetScriptOfCharsWithScriptExtensions() {
/* test characters which have Script_Extensions */
if(!(
UScript.COMMON==UScript.getScript(0x0640) &&
UScript.INHERITED==UScript.getScript(0x0650) &&
UScript.ARABIC==UScript.getScript(0xfdf2))
) {
errln("UScript.getScript(character with Script_Extensions) failed");
}
}
@Test
public void TestHasScript() {
if(!(
!UScript.hasScript(0x063f, UScript.COMMON) &&
UScript.hasScript(0x063f, UScript.ARABIC) && /* main Script value */
!UScript.hasScript(0x063f, UScript.SYRIAC) &&
!UScript.hasScript(0x063f, UScript.THAANA))
) {
errln("UScript.hasScript(U+063F, ...) is wrong");
}
if(!(
!UScript.hasScript(0x0640, UScript.COMMON) && /* main Script value */
UScript.hasScript(0x0640, UScript.ARABIC) &&
UScript.hasScript(0x0640, UScript.SYRIAC) &&
!UScript.hasScript(0x0640, UScript.THAANA))
) {
errln("UScript.hasScript(U+0640, ...) is wrong");
}
if(!(
!UScript.hasScript(0x0650, UScript.INHERITED) && /* main Script value */
UScript.hasScript(0x0650, UScript.ARABIC) &&
UScript.hasScript(0x0650, UScript.SYRIAC) &&
!UScript.hasScript(0x0650, UScript.THAANA))
) {
errln("UScript.hasScript(U+0650, ...) is wrong");
}
if(!(
!UScript.hasScript(0x0660, UScript.COMMON) && /* main Script value */
UScript.hasScript(0x0660, UScript.ARABIC) &&
!UScript.hasScript(0x0660, UScript.SYRIAC) &&
UScript.hasScript(0x0660, UScript.THAANA))
) {
errln("UScript.hasScript(U+0660, ...) is wrong");
}
if(!(
!UScript.hasScript(0xfdf2, UScript.COMMON) &&
UScript.hasScript(0xfdf2, UScript.ARABIC) && /* main Script value */
!UScript.hasScript(0xfdf2, UScript.SYRIAC) &&
UScript.hasScript(0xfdf2, UScript.THAANA))
) {
errln("UScript.hasScript(U+FDF2, ...) is wrong");
}
if(UScript.hasScript(0x0640, 0xaffe)) {
// An unguarded implementation might go into an infinite loop.
errln("UScript.hasScript(U+0640, bogus 0xaffe) is wrong");
}
}
@Test
public void TestGetScriptExtensions() {
BitSet scripts=new BitSet(UScript.CODE_LIMIT);
/* invalid code points */
if(UScript.getScriptExtensions(-1, scripts)!=UScript.UNKNOWN || scripts.cardinality()!=1 ||
!scripts.get(UScript.UNKNOWN)) {
errln("UScript.getScriptExtensions(-1) is not {UNKNOWN}");
}
if(UScript.getScriptExtensions(0x110000, scripts)!=UScript.UNKNOWN || scripts.cardinality()!=1 ||
!scripts.get(UScript.UNKNOWN)) {
errln("UScript.getScriptExtensions(0x110000) is not {UNKNOWN}");
}
/* normal usage */
if(UScript.getScriptExtensions(0x063f, scripts)!=UScript.ARABIC || scripts.cardinality()!=1 ||
!scripts.get(UScript.ARABIC)) {
errln("UScript.getScriptExtensions(U+063F) is not {ARABIC}");
}
if(UScript.getScriptExtensions(0x0640, scripts)>-3 || scripts.cardinality()<3 ||
!scripts.get(UScript.ARABIC) || !scripts.get(UScript.SYRIAC) || !scripts.get(UScript.MANDAIC)
) {
errln("UScript.getScriptExtensions(U+0640) failed");
}
if(UScript.getScriptExtensions(0xfdf2, scripts)!=-2 || scripts.cardinality()!=2 ||
!scripts.get(UScript.ARABIC) || !scripts.get(UScript.THAANA)) {
errln("UScript.getScriptExtensions(U+FDF2) failed");
}
if(UScript.getScriptExtensions(0xff65, scripts)!=-6 || scripts.cardinality()!=6 ||
!scripts.get(UScript.BOPOMOFO) || !scripts.get(UScript.YI)) {
errln("UScript.getScriptExtensions(U+FF65) failed");
}
}
@Test
public void TestDefaultScriptExtensions() {
// Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
// but some of its characters revert to scx=<script> which is usually Common.
BitSet scx = new BitSet();
assertEquals("U+3000 num scx", // IDEOGRAPHIC SPACE
UScript.COMMON,
UScript.getScriptExtensions(0x3000, scx));
scx.clear();
assertEquals("U+3012 num scx", // POSTAL MARK
UScript.COMMON,
UScript.getScriptExtensions(0x3012, scx));
}
@Test
public void TestScriptMetadataAPI() {
/* API & code coverage. */
String sample = UScript.getSampleString(UScript.LATIN);
if(sample.length()!=1 || UScript.getScript(sample.charAt(0))!=UScript.LATIN) {
errln("UScript.getSampleString(Latn) failed");
}
sample = UScript.getSampleString(UScript.INVALID_CODE);
if(sample.length()!=0) {
errln("UScript.getSampleString(invalid) failed");
}
if(UScript.getUsage(UScript.LATIN)!=ScriptUsage.RECOMMENDED ||
// Unicode 10 gives up on "aspirational".
UScript.getUsage(UScript.YI)!=ScriptUsage.LIMITED_USE ||
UScript.getUsage(UScript.CHEROKEE)!=ScriptUsage.LIMITED_USE ||
UScript.getUsage(UScript.COPTIC)!=ScriptUsage.EXCLUDED ||
UScript.getUsage(UScript.CIRTH)!=ScriptUsage.NOT_ENCODED ||
UScript.getUsage(UScript.INVALID_CODE)!=ScriptUsage.NOT_ENCODED ||
UScript.getUsage(UScript.CODE_LIMIT)!=ScriptUsage.NOT_ENCODED) {
errln("UScript.getUsage() failed");
}
if(UScript.isRightToLeft(UScript.LATIN) ||
UScript.isRightToLeft(UScript.CIRTH) ||
!UScript.isRightToLeft(UScript.ARABIC) ||
!UScript.isRightToLeft(UScript.HEBREW)) {
errln("UScript.isRightToLeft() failed");
}
if(UScript.breaksBetweenLetters(UScript.LATIN) ||
UScript.breaksBetweenLetters(UScript.CIRTH) ||
!UScript.breaksBetweenLetters(UScript.HAN) ||
!UScript.breaksBetweenLetters(UScript.THAI)) {
errln("UScript.breaksBetweenLetters() failed");
}
if(UScript.isCased(UScript.CIRTH) ||
UScript.isCased(UScript.HAN) ||
!UScript.isCased(UScript.LATIN) ||
!UScript.isCased(UScript.GREEK)) {
errln("UScript.isCased() failed");
}
}
/**
* Maps a special script code to the most common script of its encoded characters.
*/
private static final int getCharScript(int script) {
switch(script) {
case UScript.HAN_WITH_BOPOMOFO:
case UScript.SIMPLIFIED_HAN:
case UScript.TRADITIONAL_HAN:
return UScript.HAN;
case UScript.JAPANESE:
return UScript.HIRAGANA;
case UScript.JAMO:
case UScript.KOREAN:
return UScript.HANGUL;
case UScript.SYMBOLS_EMOJI:
return UScript.SYMBOLS;
default:
return script;
}
}
@Test
public void TestScriptMetadata() {
UnicodeSet rtl = new UnicodeSet("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]");
// So far, sample characters are uppercase.
// Georgian is special.
UnicodeSet cased = new UnicodeSet("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]");
for(int sc = 0; sc < UScript.CODE_LIMIT; ++sc) {
String sn = UScript.getShortName(sc);
ScriptUsage usage = UScript.getUsage(sc);
String sample = UScript.getSampleString(sc);
UnicodeSet scriptSet = new UnicodeSet();
scriptSet.applyIntPropertyValue(UProperty.SCRIPT, sc);
if(usage == ScriptUsage.NOT_ENCODED) {
assertTrue(sn + " not encoded, no sample", sample.isEmpty());
assertFalse(sn + " not encoded, not RTL", UScript.isRightToLeft(sc));
assertFalse(sn + " not encoded, not LB letters", UScript.breaksBetweenLetters(sc));
assertFalse(sn + " not encoded, not cased", UScript.isCased(sc));
assertTrue(sn + " not encoded, no characters", scriptSet.isEmpty());
} else {
assertFalse(sn + " encoded, has a sample character", sample.isEmpty());
int firstChar = sample.codePointAt(0);
int charScript = getCharScript(sc);
assertEquals(sn + " script(sample(script))",
charScript, UScript.getScript(firstChar));
assertEquals(sn + " RTL vs. set", rtl.contains(firstChar), UScript.isRightToLeft(sc));
assertEquals(sn + " cased vs. set", cased.contains(firstChar), UScript.isCased(sc));
assertEquals(sn + " encoded, has characters", sc == charScript, !scriptSet.isEmpty());
if(UScript.isRightToLeft(sc)) {
rtl.removeAll(scriptSet);
}
if(UScript.isCased(sc)) {
cased.removeAll(scriptSet);
}
}
}
assertEquals("no remaining RTL characters", "[]", rtl.toPattern(true));
assertEquals("no remaining cased characters", "[]", cased.toPattern(true));
assertTrue("Hani breaks between letters", UScript.breaksBetweenLetters(UScript.HAN));
assertTrue("Thai breaks between letters", UScript.breaksBetweenLetters(UScript.THAI));
assertFalse("Latn does not break between letters", UScript.breaksBetweenLetters(UScript.LATIN));
}
@Test
public void TestScriptNames(){
for(int i=0; i<UScript.CODE_LIMIT;i++){
String name = UScript.getName(i);
if(name.equals("") ){
errln("FAILED: getName for code : "+i);
}
String shortName= UScript.getShortName(i);
if(shortName.equals("")){
errln("FAILED: getName for code : "+i);
}
}
}
@Test
public void TestAllCodepoints(){
int code;
//String oldId="";
//String oldAbbrId="";
for( int i =0; i <= 0x10ffff; i++){
code =UScript.INVALID_CODE;
code = UScript.getScript(i);
if(code==UScript.INVALID_CODE){
errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed");
}
String id =UScript.getName(code);
if(id.indexOf("INVALID")>=0){
errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed");
}
String abbr = UScript.getShortName(code);
if(abbr.indexOf("INV")>=0){
errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed");
}
}
}
@Test
public void TestNewCode(){
/*
* These script codes were originally added to ICU pre-3.6, so that ICU would
* have all ISO 15924 script codes. ICU was then based on Unicode 4.1.
* These script codes were added with only short names because we don't
* want to invent long names ourselves.
* Unicode 5 and later encode some of these scripts and give them long names.
* Whenever this happens, the long script names here need to be updated.
*/
String[] expectedLong = new String[]{
"Balinese", "Batak", "Blis", "Brahmi", "Cham", "Cirt", "Cyrs",
"Egyd", "Egyh", "Egyptian_Hieroglyphs",
"Geok", "Hans", "Hant", "Pahawh_Hmong", "Old_Hungarian", "Inds",
"Javanese", "Kayah_Li", "Latf", "Latg",
"Lepcha", "Linear_A", "Mandaic", "Maya", "Meroitic_Hieroglyphs",
"Nko", "Old_Turkic", "Old_Permic", "Phags_Pa", "Phoenician",
"Miao", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vai", "Visp", "Cuneiform",
"Zxxx", "Unknown",
"Carian", "Jpan", "Tai_Tham", "Lycian", "Lydian", "Ol_Chiki", "Rejang", "Saurashtra", "SignWriting", "Sundanese",
"Moon", "Meetei_Mayek",
/* new in ICU 4.0 */
"Imperial_Aramaic", "Avestan", "Chakma", "Kore",
"Kaithi", "Manichaean", "Inscriptional_Pahlavi", "Psalter_Pahlavi", "Phlv",
"Inscriptional_Parthian", "Samaritan", "Tai_Viet",
"Zmth", "Zsym",
/* new in ICU 4.4 */
"Bamum", "Lisu", "Nkgb", "Old_South_Arabian",
/* new in ICU 4.6 */
"Bassa_Vah", "Duployan", "Elbasan", "Grantha", "Kpel",
"Loma", "Mende_Kikakui", "Meroitic_Cursive",
"Old_North_Arabian", "Nabataean", "Palmyrene", "Khudawadi", "Warang_Citi",
/* new in ICU 4.8 */
"Afak", "Jurc", "Mro", "Nushu", "Sharada", "Sora_Sompeng", "Takri", "Tangut", "Wole",
/* new in ICU 49 */
"Anatolian_Hieroglyphs", "Khojki", "Tirhuta",
/* new in ICU 52 */
"Caucasian_Albanian", "Mahajani",
/* new in ICU 54 */
"Ahom", "Hatran", "Modi", "Multani", "Pau_Cin_Hau", "Siddham",
// new in ICU 58
"Adlam", "Bhaiksuki", "Marchen", "Newa", "Osage", "Hanb", "Jamo", "Zsye",
// new in ICU 60
"Masaram_Gondi", "Soyombo", "Zanabazar_Square",
// new in ICU 61
"Dogra", "Gunjala_Gondi", "Makasar", "Medefaidrin",
"Hanifi_Rohingya", "Sogdian", "Old_Sogdian",
// new in ICU 64
"Elymaic", "Nyiakeng_Puachue_Hmong", "Nandinagari", "Wancho",
// new in ICU 66
"Chorasmian", "Dives_Akuru", "Khitan_Small_Script", "Yezidi",
};
String[] expectedShort = new String[]{
"Bali", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyp",
"Geok", "Hans", "Hant", "Hmng", "Hung", "Inds", "Java", "Kali", "Latf", "Latg",
"Lepc", "Lina", "Mand", "Maya", "Mero", "Nkoo", "Orkh", "Perm", "Phag", "Phnx",
"Plrd", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vaii", "Visp", "Xsux",
"Zxxx", "Zzzz",
"Cari", "Jpan", "Lana", "Lyci", "Lydi", "Olck", "Rjng", "Saur", "Sgnw", "Sund",
"Moon", "Mtei",
/* new in ICU 4.0 */
"Armi", "Avst", "Cakm", "Kore",
"Kthi", "Mani", "Phli", "Phlp", "Phlv", "Prti", "Samr", "Tavt",
"Zmth", "Zsym",
/* new in ICU 4.4 */
"Bamu", "Lisu", "Nkgb", "Sarb",
/* new in ICU 4.6 */
"Bass", "Dupl", "Elba", "Gran", "Kpel", "Loma", "Mend", "Merc",
"Narb", "Nbat", "Palm", "Sind", "Wara",
/* new in ICU 4.8 */
"Afak", "Jurc", "Mroo", "Nshu", "Shrd", "Sora", "Takr", "Tang", "Wole",
/* new in ICU 49 */
"Hluw", "Khoj", "Tirh",
/* new in ICU 52 */
"Aghb", "Mahj",
/* new in ICU 54 */
"Ahom", "Hatr", "Modi", "Mult", "Pauc", "Sidd",
// new in ICU 58
"Adlm", "Bhks", "Marc", "Newa", "Osge", "Hanb", "Jamo", "Zsye",
// new in ICU 60
"Gonm", "Soyo", "Zanb",
// new in ICU 61
"Dogr", "Gong", "Maka", "Medf", "Rohg", "Sogd", "Sogo",
// new in ICU 64
"Elym", "Hmnp", "Nand", "Wcho",
// new in ICU 66
"Chrs", "Diak", "Kits", "Yezi",
};
if(expectedLong.length!=(UScript.CODE_LIMIT-UScript.BALINESE)) {
errln("need to add new script codes in lang.TestUScript.java!");
return;
}
int j = 0;
int i = 0;
for(i=UScript.BALINESE; i<UScript.CODE_LIMIT; i++, j++){
String name = UScript.getName(i);
if(name==null || !name.equals(expectedLong[j])){
errln("UScript.getName failed for code"+ i + name +"!=" +expectedLong[j]);
}
name = UScript.getShortName(i);
if(name==null || !name.equals(expectedShort[j])){
errln("UScript.getShortName failed for code"+ i + name +"!=" +expectedShort[j]);
}
}
for(i=0; i<expectedLong.length; i++){
int[] ret = UScript.getCode(expectedShort[i]);
if(ret.length>1){
errln("UScript.getCode did not return expected number of codes for script"+ expectedShort[i]+". EXPECTED: 1 GOT: "+ ret.length);
}
if(ret[0]!= (UScript.BALINESE+i)){
errln("UScript.getCode did not return expected code for script"+ expectedShort[i]+". EXPECTED: "+ (UScript.BALINESE+i)+" GOT: %i\n"+ ret[0] );
}
}
}
}