// blob: b846fbee2a6e7bd6cd8cf8eafc749a681dc493ff [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.translit;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.UnicodeRegex;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UProperty.NameChoice;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
 * Tests for {@link UnicodeRegex}: rewriting of UnicodeSet syntax inside regular
 * expressions ({@code fix}), compilation to {@link Pattern} ({@code compile}), and
 * BNF-style rule compilation ({@code compileBnf}).
 *
 * @author markdavis
 */
public class RegexUtilitiesTest extends TestFmwk {

    /**
     * Command-line entry point; hands the arguments to the test framework runner.
     *
     * @param args arguments passed through to {@code run}
     * @throws Exception whatever {@code run} propagates
     */
    public static void main(String[] args) throws Exception {
        new RegexUtilitiesTest().run(args);
    }

    /**
     * Check basic construction.
     *
     * Each row is either {source} (the output is expected to equal the input) or
     * {source, expected}.
     */
    public void TestConstruction() {
        String[][] tests = {
                {"a"},
                {"a[a-z]b"},
                {"[ba-z]", "[a-z]"},
                {"q[ba-z]", "q[a-z]"},
                {"[ba-z]q", "[a-z]q"},
                {"a\\p{joincontrol}b", "a[\u200C\u200D]b"},
                {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"},
                {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"},
                {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"},
        };
        for (int i = 0; i < tests.length; ++i) {
            final String source = tests[i][0];
            String expected = tests[i].length == 1 ? source : tests[i][1];
            String actual = UnicodeRegex.fix(source);
            assertEquals(source, expected, actual);
        }
    }

    // Used to render characters as hex escapes in assertion/error messages.
    Transliterator hex = Transliterator.getInstance("hex");

    /**
     * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each
     * character works.
     *
     * When getInclusion() is below 10, only code points up to U+00FF plus every 37th
     * code point above that are checked.
     */
    public void TestCharacters() {
        UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
        boolean skip = getInclusion() < 10;
        for (int cp = 0; cp < 0x110000; ++cp) {
            if (cp > 0xFF && skip && (cp % 37 != 0)) {
                continue;
            }
            String cpString = UTF16.valueOf(cp);
            // Characters in requiresQuote are backslash-escaped before being placed in the set.
            String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString;
            final String rawPattern = "[" + s + s + "]";
            String pattern;
            try {
                pattern = UnicodeRegex.fix(rawPattern);
            } catch (Exception e) {
                // Report which pattern failed and the exception type; getMessage() alone may be null.
                errln("UnicodeRegex.fix failed for " + hex.transform(rawPattern) + ": "
                        + e.getClass().getName() + ": " + e.getMessage());
                continue;
            }
            // The doubled character must collapse to a single-member set.
            final String expected = "[" + s + "]";
            assertEquals("Doubled character works: " + hex.transform(s), expected, pattern);

            // verify that the fixed pattern can be compiled by java.util.regex and behaves as expected;
            // the "should not match" sample is simply the next code point (wrapping to U+0000)
            String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);
            checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch);

            // verify that UnicodeRegex.compile on the raw pattern behaves the same way
            checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch);
        }
    }

    /**
     * Check all integer Unicode properties to make sure they work.
     *
     * When getInclusion() is below 10, only every 5th property is tested, and only the
     * minimum value of each tested property.
     */
    public void TestUnicodeProperties() {
        final boolean skip = getInclusion() < 10;
        final String prefix = "a";
        final String suffix = "b";
        UnicodeSet temp = new UnicodeSet();
        for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) {
            if (skip && (propNum % 5 != 0)) {
                continue;
            }
            String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG);
            final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum);
            int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum);
            if (skip) { // only test first if not exhaustive
                intPropertyMaxValue = intPropertyMinValue;
            }
            for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) {
                // hack for getting property value name: long name, else short name, else the number
                String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG);
                if (valueName == null) {
                    valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT);
                    if (valueName == null) {
                        valueName = Integer.toString(valueNum);
                    }
                }
                temp.applyIntPropertyValue(propNum, valueNum);
                if (temp.size() == 0) {
                    continue; // no character has this value, so there is nothing to match
                }
                String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
                temp.complement();
                if (temp.size() == 0) {
                    // Every code point has this value, so no counter-example exists;
                    // charAt(0) on an empty set would not yield a valid code point.
                    continue;
                }
                String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;

                // posix style pattern
                String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix;
                String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix;
                checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
                checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);

                // perl style pattern
                rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix;
                rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix;
                checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
                checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
            }
        }
    }

    /**
     * Check BNF compilation.
     *
     * Each row starts with either inline BNF rules or the name of a ".txt" resource
     * (loaded relative to this class). A second element equal to "Exception" means
     * compilation is expected to throw; any other trailing elements are strings that
     * are matched against the compiled pattern and whose capture groups are logged.
     */
    public void TestBnf() {
        UnicodeRegex regex = new UnicodeRegex();
        final String[][] tests = {
                {
                    "c = a wq;\n" +
                    "a = xyz;\n" +
                    "b = a a c;\n"
                },
                {
                    "c = a b;\n" +
                    "a = xyz;\n" +
                    "b = a a c;\n",
                    "Exception"
                },
                {
                    "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" +
                    "scheme = reserved+;\n" +
                    "host = // reserved+;\n" +
                    "query = [\\=reserved]+;\n" +
                    "fragment = reserved+;\n" +
                    "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n",
                    "http://\u03B1\u03B2\u03B3?huh=hi#there"},
                {
                    "langtagRegex.txt"
                }
        };
        for (int i = 0; i < tests.length; ++i) {
            String test = tests[i][0];
            final boolean expectException = tests[i].length >= 2 && tests[i][1].equals("Exception");
            try {
                String result;
                if (test.endsWith(".txt")) {
                    java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test);
                    if (is == null) {
                        // getResourceAsStream returns null when the resource cannot be found.
                        errln("Missing test resource: " + test);
                        continue;
                    }
                    List lines;
                    try {
                        lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8");
                    } finally {
                        is.close(); // always release the stream, even if reading fails
                    }
                    result = regex.compileBnf(lines);
                } else {
                    result = regex.compileBnf(test);
                }
                if (expectException) {
                    errln("Expected exception for " + test);
                    continue;
                }
                result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff
                String resolved = regex.transform(result);
                logln(resolved);
                Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher("");
                String checks = "";
                // NOTE(review): a check string that fails to match is only logged, not
                // reported via errln -- confirm whether that is intentional.
                for (int j = 1; j < tests[i].length; ++j) {
                    String check = tests[i][j];
                    if (!m.reset(check).matches()) {
                        checks = checks + "Fails " + check + "\n";
                    } else {
                        for (int k = 1; k <= m.groupCount(); ++k) {
                            checks += "(" + m.group(k) + ")";
                        }
                        checks += "\n";
                    }
                }
                logln("Result: " + result + "\n" + checks + "\n" + test);
            } catch (Exception e) {
                if (!expectException) {
                    errln(e.getClass().getName() + ": " + e.getMessage());
                }
            }
        }
    }

    /**
     * Utility for checking patterns: asserts that {@code pat} matches all of
     * {@code shouldMatch} and does not match all of {@code shouldNotMatch}.
     *
     * @param pat            compiled pattern under test
     * @param matchTitle     label used as the prefix of the assertion messages
     * @param shouldMatch    input that must match in full
     * @param shouldNotMatch input that must not match in full
     */
    private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) {
        Matcher matcher = pat.matcher(shouldMatch);
        assertTrue(matchTitle + " and " + shouldMatch, matcher.matches());
        matcher.reset(shouldNotMatch);
        assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches());
    }
}