| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $ |
| * $Date: 2006/09/24 23:32:44 $ |
| * $Revision: 1.15 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.text.UCD; |
| |
| import java.util.*; |
| import java.io.*; |
| |
| import org.unicode.cldr.util.Segmenter; |
| |
| import com.ibm.text.utility.*; |
| import com.ibm.icu.dev.test.util.UnicodeMap; |
| import com.ibm.icu.dev.test.util.UnicodeProperty; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| abstract public class GenerateBreakTest implements UCD_Types { |
| |
| static boolean DEBUG = true; |
| static final boolean SHOW_TYPE = false; |
| UCD ucd; |
| Normalizer nfd; |
| Normalizer nfkd; |
| |
| OldUnicodeMap sampleMap = null; |
| OldUnicodeMap map = new OldUnicodeMap(); |
| UnicodeProperty prop; |
| |
| // ====================== Main =========================== |
| |
| public static void main(String[] args) throws IOException { |
| System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); |
| //Default.setUCD(); |
| new GenerateGraphemeBreakTest(Default.ucd()).run(); |
| new GenerateWordBreakTest(Default.ucd()).run(); |
| new GenerateLineBreakTest(Default.ucd()).run(); |
| new GenerateSentenceBreakTest(Default.ucd()).run(); |
| } |
| |
| GenerateBreakTest(UCD ucd) { |
| this.ucd = ucd; |
| nfd = new Normalizer(Normalizer.NFD, ucd.getVersion()); |
| nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion()); |
| /* |
| public void fillMap(String propName) { |
| List list = y.getAvailableValues(); |
| for (Iterator it = list.iterator(); it.hasNext();) { |
| String label = (String) it.next(); |
| map.add(label, y.getSet(label)); |
| } |
| } |
| */ |
| } |
| |
| ToolUnicodePropertySource unicodePropertySource = ToolUnicodePropertySource.make(""); |
| |
| Set labels = new HashSet(); |
| |
| int addToMap(String label) { |
| labels.add(label); |
| UnicodeSet s = prop.getSet(label); |
| if (s == null || s.size() == 0) throw new IllegalArgumentException("Bad value: " + prop.getName() + ", " + label); |
| return map.add(label, s); |
| } |
| |
| int addToMapLast(String label) { |
| int result = addToMap(label); |
| Set values = new HashSet(prop.getAvailableValues()); |
| if (!values.equals(labels)) throw new IllegalArgumentException("Missing Property Values: " + prop.getName() |
| + ": " + values.removeAll(labels)); |
| return result; |
| } |
| |
| // COMMON STUFF for Hangul |
| /* |
| static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5; |
| static final String[] hNames = {"L", "V", "T", "LV", "LVT"}; |
| |
| |
| static byte getHangulType(int cp) { |
| if (ucd.isLeadingJamo(cp)) return hL; |
| if (ucd.isVowelJamo(cp)) return hV; |
| if (ucd.isTrailingJamo(cp)) return hT; |
| if (ucd.isHangulSyllable(cp)) { |
| if (ucd.isDoubleHangul(cp)) return hLV; |
| return hLVT; |
| } |
| return hNot; |
| } |
| */ |
| |
| /* static { |
| setUCD(); |
| } |
| */ |
| |
| public static boolean onCodepointBoundary(String s, int offset) { |
| if (offset < 0 || offset > s.length()) return false; |
| if (offset == 0 || offset == s.length()) return true; |
| if (UTF16.isLeadSurrogate(s.charAt(offset-1)) |
| && UTF16.isTrailSurrogate(s.charAt(offset))) return false; |
| return true; |
| } |
| |
| // finds the first base character, or the first character if there is no base |
| public int findFirstBase(String source, int start, int limit) { |
| int cp; |
| for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(source, i); |
| byte cat = ucd.getCategory(cp); |
| if (((1<<cat) & MARK_MASK) != 0) continue; |
| return cp; |
| } |
| return UTF16.charAt(source, start); |
| } |
| |
| // quick & dirty routine |
| static String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) { |
| String result = insertion; |
| for (int i = 0; i < source.length(); ++i) { |
| result += source.charAt(i); |
| if (breaker.isBreak(source, i)) { |
| result += insertion; |
| } |
| } |
| return result + insertion; |
| } |
| |
| |
| static void checkDecomps(UCD ucd) { |
| UCDProperty[] INFOPROPS = {UnifiedProperty.make(CATEGORY), UnifiedProperty.make(LINE_BREAK)}; |
| GenerateBreakTest[] tests = { |
| new GenerateGraphemeBreakTest(ucd), |
| new GenerateWordBreakTest(ucd), |
| new GenerateLineBreakTest(ucd), |
| }; |
| tests[0].isBreak("\u0300\u0903", 1); |
| Normalizer nfd = new Normalizer(Normalizer.NFD, ucd.getVersion()); |
| |
| System.out.println("Check Decomps"); |
| //System.out.println("otherExtendSet: " + ((GenerateGraphemeBreakTest)tests[0]).otherExtendSet.toPattern(true)); |
| //Utility.showSetNames("", ((GenerateGraphemeBreakTest)tests[0]).otherExtendSet, false, ucd); |
| |
| for (int k = 0; k < tests.length; ++k) { |
| for (int i = 0; i < 0x10FFFF; ++i) { |
| if (!ucd.isAllocated(i)) continue; |
| if (ucd.isHangulSyllable(i)) continue; |
| if (nfd.isNormalized(i)) continue; |
| String decomp = nfd.normalize(i); |
| boolean shown = false; |
| String test = decomp; |
| for (int j = 1; j < test.length(); ++j) { |
| if (tests[k].isBreak(test, j)) { |
| if (!shown) { |
| System.out.println(showData(ucd, UTF16.valueOf(i), INFOPROPS, "\r\n\t")); |
| System.out.println(" => " + showData(ucd, decomp, INFOPROPS, "\r\n\t")); |
| shown = true; |
| } |
| System.out.println(j + ": " + tests[k].fileName); |
| } |
| } |
| } |
| } |
| } |
| |
| static String showData(UCD ucd, String source, UCDProperty[] props, String separator) { |
| StringBuffer result = new StringBuffer(); |
| int cp; |
| for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(source, i); |
| if (i != 0) result.append(separator); |
| result.append(ucd.getCodeAndName(cp)); |
| for (int j = 0; j < props.length; ++j) { |
| result.append(", "); |
| result.append(props[j].getPropertyName(SHORT)).append('=').append(props[j].getValue(cp,SHORT)); |
| } |
| } |
| return result.toString(); |
| } |
| |
| void showSet(String title, UnicodeSet set) { |
| System.out.println(title + ": " + set.toPattern(true)); |
| Utility.showSetNames("", set, false, ucd); |
| } |
| |
| // determines if string is of form Base NSM* |
| boolean isBaseNSMStar(String source) { |
| int cp; |
| int status = 0; |
| for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(source, i); |
| byte cat = ucd.getCategory(cp); |
| int catMask = 1<<cat; |
| switch(status) { |
| case 0: if ((catMask & BASE_MASK) == 0) return false; |
| status = 1; |
| break; |
| case 1: if ((catMask & NONSPACING_MARK_MASK) == 0) return false; |
| break; |
| } |
| |
| } |
| return true; |
| } |
| |
| UnicodeSet getClosure(UnicodeSet source) { |
| UnicodeSet result = new UnicodeSet(source); |
| for (int i = 0; i < 0x10FFFF; ++i) { |
| if (!ucd.isAllocated(i)) continue; |
| if (nfkd.isNormalized(i)) continue; |
| String decomp = nfkd.normalize(i); |
| if (source.containsAll(decomp)) result.add(i); |
| } |
| return result; |
| } |
| |
| |
| /* |
| static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5-\\u02ED\\u05F3]"); |
| static UnicodeSet alphabeticSet = UnifiedBinaryProperty.make(DERIVED | PropAlphabetic).getSet() |
| .addAll(extraAlpha); |
| |
| static UnicodeSet ideographicSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet(); |
| |
| static { |
| if (false) System.out.println("alphabetic: " + alphabeticSet.toPattern(true)); |
| } |
| */ |
| |
| |
| void generateTerminalClosure() { |
| UnicodeSet midLetterSet = new UnicodeSet("[\u0027\u002E\u003A\u00AD\u05F3\u05F4\u2019\uFE52\uFE55\uFF07\uFF0E\uFF1A]"); |
| |
| UnicodeSet ambigSentPunct = new UnicodeSet("[\u002E\u0589\u06D4]"); |
| |
| UnicodeSet sentPunct = new UnicodeSet("[\u0021\u003F\u0387\u061F\u0964\u203C\u203D\u2048\u2049" |
| + "\u3002\ufe52\ufe57\uff01\uff0e\uff1f\uff61]"); |
| |
| UnicodeSet terminals = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Terminal_Punctuation).getSet(); |
| UnicodeSet extras = getClosure(terminals).removeAll(terminals); |
| System.out.println("Current Terminal_Punctuation"); |
| Utility.showSetNames("", terminals, true, ucd); |
| |
| System.out.println("Missing Terminal_Punctuation"); |
| Utility.showSetNames("", extras, true, ucd); |
| |
| System.out.println("midLetterSet"); |
| System.out.println(midLetterSet.toPattern(true)); |
| Utility.showSetNames("", midLetterSet, true, ucd); |
| |
| System.out.println("ambigSentPunct"); |
| System.out.println(ambigSentPunct.toPattern(true)); |
| Utility.showSetNames("", ambigSentPunct, true, ucd); |
| |
| System.out.println("sentPunct"); |
| System.out.println(sentPunct.toPattern(true)); |
| Utility.showSetNames("", sentPunct, true, ucd); |
| /* |
| |
| UnicodeSet sentencePunctuation = new UnicodeSet("[\u0021\003F ; Terminal_Punctuation # Po QUESTION MARK |
| 037E ; Terminal_Punctuation # Po GREEK QUESTION MARK |
| 061F ; Terminal_Punctuation # Po ARABIC QUESTION MARK |
| 06D4 ; Terminal_Punctuation # Po ARABIC FULL STOP |
| 203C..203D ; Terminal_Punctuation # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG |
| 3002 ; Terminal_Punctuation # Po IDEOGRAPHIC FULL STOP |
| 2048..2049 ; Terminal_Punctuation # Po [2] QUESTION EXCLAMATION MARK..EXCLAMATION QUESTION MARK |
| */ |
| |
| } |
| |
| //============================ |
| |
| protected String currentRule; |
| protected String fileName; |
| protected String[] samples = new String[100]; |
| protected String[] extraSamples = new String[0]; |
| protected String[] extraSingleSamples = new String[0]; |
| protected int sampleLimit = 0; |
| protected int tableLimit = -1; |
| |
| protected int[] skippedSamples = new int[100]; |
| protected boolean didSkipSamples = false; |
| |
| private String[] ruleList = new String[100]; |
| private int ruleListCount = 0; |
| protected boolean collectingRules = false; |
| protected boolean needsFullBreakSample = true; |
| |
| public void setRule(String rule) { |
| if (collectingRules) { |
| ruleList[ruleListCount++] = rule; |
| } |
| currentRule = rule; |
| } |
| |
| public String getRule() { |
| return currentRule; |
| } |
| |
| public void run() throws IOException { |
| findSamples(); |
| |
| // test individual cases |
| //printLine(out, samples[LB_ZW], "", samples[LB_CL]); |
| //printLine(out, samples[LB_ZW], " ", samples[LB_CL]); |
| |
| UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("DerivedData\\auxiliary\\", fileName + "BreakTest"); |
| PrintWriter out = fc.out; |
| |
| /* PrintWriter out = Utility.openPrintWriter("auxiliary\\" |
| + fileName + "BreakTest-" |
| + ucd.getVersion() |
| + ".html", Utility.UTF8_WINDOWS); |
| */ |
| out.println("<!doctype HTML PUBLIC '-//W3C//DTD HTML 4.0 Transitional//EN' 'http://www.w3.org/TR/REC-html40/loose.dtd'>"); |
| out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>"); |
| out.println("<title>" + fileName + " Break Chart</title>"); |
| out.println("<style type='text/css'>"); |
| out.println("td, th { vertical-align: top }"); |
| out.println("</style></head>"); |
| |
| |
| out.println("<body bgcolor='#FFFFFF'>"); |
| out.println("<h2>" + fileName + " Break Chart</h2>"); |
| out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "</p>"); |
| out.println("<p><b>Date:</b> " + Default.getDate() + "</p>"); |
| out.println("<p>This page illustrates the application of the boundary specifications. " + |
| "The first chart shows where breaks would appear between different sample characters or strings. " + |
| "The sample characters are chosen mechanically to represent the different properties used by the specification. " + |
| "Where properties used in the rules have 'overlaps', the samples are given 'composed' names. " + |
| "For example, SentenceBreak uses GCLF_Sep: Sep is the SentenceBreak property, but it overlaps with the GraphemeClusterBreak property LF." + |
| "</p>"); |
| generateTable(out); |
| |
| |
| if (false) { |
| out.println("<h3>Character Type Breakdown</h3>"); |
| out.println("<table border='1' cellspacing='0' width='100%'>"); |
| for (int i = 0; i < sampleMap.size(); ++i) { |
| out.println("<tr><th>" + sampleMap.getLabelFromIndex(i) |
| + "</th><td>" + sampleMap.getSetFromIndex(i) |
| + "</td></tr>"); |
| } |
| out.println("</table>"); |
| } |
| |
| fc.close(); |
| |
| generateTest(false); |
| |
| } |
| |
| public void generateTest(boolean shortVersion) throws IOException { |
| String[] testCase = new String[50]; |
| // do main test |
| |
| UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("DerivedData\\auxiliary\\", fileName + "BreakTest" |
| + (shortVersion ? "_SHORT" : "")); |
| PrintWriter out = fc.out; |
| /* PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest" |
| + (shortVersion ? "_SHORT" : "") |
| + "-" + ucd.getVersion() |
| + ".txt", Utility.UTF8_WINDOWS); |
| */ |
| int counter = 0; |
| |
| out.println("#"); |
| out.println("# Default " + fileName + " Break Test"); |
| out.println("#"); |
| out.println("# Format:"); |
| out.println("# <string> (# <comment>)? "); |
| out.println("# <string> contains hex Unicode code points, with "); |
| out.println("#\t" + BREAK + " wherever there is a break opportunity, and "); |
| out.println("#\t" + NOBREAK + " wherever there is not."); |
| out.println("# <comment> the format can change, but currently it shows:"); |
| out.println("#\t- the sample character name"); |
| out.println("#\t- (x) the line_break property* for the sample character"); |
| out.println("#\t- [x] the rule that determines whether there is a break or not"); |
| out.println("#"); |
| sampleDescription(out); |
| out.println("# These samples may be extended or changed in the future."); |
| out.println("#"); |
| |
| for (int ii = 0; ii < sampleLimit; ++ii) { |
| String before = samples[ii]; |
| |
| for (int jj = 0; jj < sampleLimit; ++jj) { |
| Utility.dot(counter); |
| String after = samples[jj]; |
| |
| // do line straight |
| int len = genTestItems(before, after, testCase); |
| for (int q = 0; q < len; ++q) { |
| printLine(out, testCase[q], !shortVersion && q == 0, false); |
| ++counter; |
| } |
| } |
| } |
| |
| for (int ii = 0; ii < extraSingleSamples.length; ++ii) { |
| printLine(out, extraSingleSamples[ii], true, false); |
| } |
| out.println("# Lines: " + counter); |
| fc.close(); |
| } |
| |
| public void sampleDescription(PrintWriter out) {} |
| |
| abstract public boolean isBreak(String source, int offset); |
| |
| abstract public String fullBreakSample(); |
| |
| abstract public byte getType (int cp); |
| |
| public byte getSampleType (int cp) { |
| return getType(cp); |
| } |
| |
| public int mapType(int input) { |
| return input; |
| } |
| |
| public boolean highlightTableEntry(int x, int y, String s) { |
| return false; |
| } |
| |
| abstract public String getTypeID(int s); |
| |
| public String getTypeID(String s) { |
| if (s == null) return "<null>"; |
| if (s.length() == 1) return getTypeID(s.charAt(0)); |
| StringBuffer result = new StringBuffer(); |
| int cp; |
| for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { |
| cp = UTF32.char32At(s, i); |
| if (i > 0) result.append(" "); |
| result.append(getTypeID(cp)); |
| } |
| return result.toString(); |
| } |
| |
| static final int DONE = -1; |
| |
| public int next(String source, int offset) { |
| for (int i = offset + 1; i <= source.length(); ++i) { |
| if (isBreak(source, i)) return i; |
| } |
| return DONE; |
| } |
| |
| public int previous(String source, int offset) { |
| for (int i = offset - 1; i >= 0; --i) { |
| if (isBreak(source, i)) return i; |
| } |
| return DONE; |
| } |
| |
| public int genTestItems(String before, String after, String[] results) { |
| results[0] = before + after; |
| return 1; |
| } |
| |
| public String getTableEntry(String before, String after, String[] ruleOut) { |
| boolean normalBreak = isBreak(before + after, before.length()); |
| String normalRule = getRule(); |
| ruleOut[0] = normalRule; |
| return normalBreak ? BREAK : NOBREAK; |
| } |
| |
| public byte getResolvedType(int cp) { |
| return getType(cp); |
| } |
| |
| boolean skipType(int type) { |
| return false; |
| } |
| |
| String getInfo(String s) { |
| if (s == null || s.length() == 0) return "NULL"; |
| StringBuffer result = new StringBuffer(); |
| int cp; |
| for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { |
| cp = UTF32.char32At(s, i); |
| if (i > 0) result.append(", "); |
| result.append(ucd.getCodeAndName(cp)); |
| result.append(", gc=" + ucd.getCategoryID_fromIndex(ucd.getCategory(cp),SHORT)); |
| result.append(", sc=" + ucd.getScriptID_fromIndex(ucd.getScript(cp),SHORT)); |
| //result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp)) |
| // + "=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), LONG)); |
| } |
| return result.toString(); |
| } |
| |
| public void generateTable(PrintWriter out) { |
| String width = "width='" + (100 / (tableLimit + 1)) + "%'"; |
| out.print("<table border='1' cellspacing='0' width='100%'>"); |
| String types = ""; |
| String codes = ""; |
| for (int type = 0; type < tableLimit; ++type) { |
| String after = samples[type]; |
| if (after == null) continue; |
| |
| String h = getTypeID(after); |
| types += "<th " + width + " class='lbclass' title='" + getInfo(after) + "'>" + h + "</th>"; |
| |
| |
| //codes += "<th " + width + " title='" + getInfo(after) + "'>" + Utility.hex(after) + "</th>"; |
| } |
| |
| out.println("<tr><th " + width + "></th>" + types + "</tr>"); |
| // out.println("<tr><th " + width + "></th><th " + width + "></th>" + codes + "</tr>"); |
| |
| String[] rule = new String[1]; |
| String[] rule2 = new String[1]; |
| for (int type = 0; type < sampleLimit; ++type) { |
| if (type == tableLimit) { |
| out.println("<tr><td bgcolor='#0000FF' colSpan='" + (tableLimit + 1) + "' style='font-size: 1px'> </td></tr>"); |
| } |
| String before = samples[type]; |
| if (before == null) continue; |
| |
| String h = getTypeID(before); |
| String line = "<tr><th class='lbclass' title='" + ucd.getCodeAndName(before) + "'>" + h + "</th>"; |
| |
| for (int type2 = 0; type2 < tableLimit; ++type2) { |
| |
| String after = samples[type2]; |
| if (after == null) continue; |
| |
| String t = getTableEntry(before, after, rule); |
| String background = ""; |
| String t2 = getTableEntry(before, after, rule2); |
| if (highlightTableEntry(type, type2, t)) { |
| background = " bgcolor='#FFFF00'"; |
| } |
| if (!t.equals(t2)) { |
| if (t.equals(NOBREAK)) { |
| background = " bgcolor='#CCFFFF'"; |
| } else { |
| background = " bgcolor='#FFFF00'"; |
| } |
| } else if (t.equals(NOBREAK)) { |
| background = " bgcolor='#CCCCFF'"; |
| } |
| line += "<th title='" + rule[0] + "'" + background + " class='pairItem'>" + t + "</th>"; |
| } |
| out.println(line + "</tr>"); |
| } |
| out.println("</table>"); |
| |
| if (didSkipSamples) { |
| out.println("<p><b>Suppressed:</b> "); |
| for (int i = 0; i < skippedSamples.length; ++i) { |
| if (skippedSamples[i] > 0) { |
| String tmp = UTF16.valueOf(skippedSamples[i]); |
| out.println("<span title='" + getInfo(tmp) + "'>" + getTypeID(tmp) + "</span>"); |
| } |
| } |
| out.println("</p>"); |
| } |
| |
| // gather the data for the rules |
| if (needsFullBreakSample ) { |
| collectingRules = true; |
| isBreak(fullBreakSample(), 1); |
| collectingRules = false; |
| } |
| |
| out.println("<h3>Rules</h3>"); |
| out.println("<p>Due to the way they have been mechanically processed for generation, " + |
| "the following rules do not match the UAX rules precisely. " + |
| "In particular:</p>"+ |
| "<ol>" + |
| "<li>The rules are cast into a more regex-style.</li>"+ |
| "<li>The rules \"sot ÷\", \"÷ eot\", and \"÷ Any\" are added mechanically, and have artificial numbers.</li>"+ |
| "<li>The rules are given decimal numbers, so rules such as 11a are given a number using tenths, such as 11.1.</li>"+ |
| "<li>Where a rule has multiple parts (lines), each one is numbered using hundredths, such as 21.01) × BA, 21.02) × HY,...</li>"+ |
| "<li>Any 'treat as' or 'ignore' rules are handled as discussed in Unicode Standard Annex #29, and thus" + |
| "reflected in a transformation of the rules not visible here.</li>" + |
| "</ol>" + |
| "<p>For the original rules, see the UAX.</p>" |
| |
| ); |
| out.println("<ul style='list-style-type: none'>"); |
| for (int ii = 0; ii < ruleListCount; ++ii) { |
| out.println("<li>" + ruleList[ii].replaceAll("[$]","") + "</li>"); |
| } |
| out.println("</ul>"); |
| |
| if (extraSingleSamples.length > 0) { |
| out.println("<h3>Sample Strings</h3>"); |
| out.println("<p>" + |
| "The following samples illustrate the application of the rules. " + |
| "The blue lines indicate possible break points. " + |
| "If your browser supports titles, then positioning the mouse over each character will show its name, " + |
| "white positioning between characters shows the rule number of the rule responsible for the break-status." + |
| "</p>"); |
| out.println("<ol>"); |
| for (int ii = 0; ii < extraSingleSamples.length; ++ii) { |
| out.println("<li><font size='5'>"); |
| printLine(out, extraSingleSamples[ii], true, true); |
| out.println("</font></li>"); |
| } |
| out.println("</ol>"); |
| } |
| } |
| |
| static final String BREAK = "\u00F7"; |
| static final String NOBREAK = "\u00D7"; |
| |
| public void printLine(PrintWriter out, String source, boolean comments, boolean html) { |
| int cp; |
| StringBuffer string = new StringBuffer(); |
| StringBuffer comment = new StringBuffer("\t# "); |
| boolean hasBreak = isBreak(source, 0); |
| String status; |
| if (html) { |
| status = hasBreak ? " style='border-right: 1px solid blue'" : ""; |
| string.append("<span title='" + getRule() + "'><span" + status + "> </span> </span>"); |
| } else { |
| status = hasBreak ? BREAK : NOBREAK; |
| string.append(status); |
| } |
| comment.append(' ').append(status).append(" [").append(getRule()).append(']'); |
| |
| for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) { |
| |
| cp = UTF16.charAt(source, offset); |
| hasBreak = isBreak(source, offset + UTF16.getCharCount(cp)); |
| |
| if (html) { |
| status = hasBreak ? " style='border-right: 1px solid blue'" : ""; |
| string.append("<span title='" + |
| Utility.quoteXML(ucd.getCodeAndName(cp) + " (" + getTypeID(cp) + ")", true) |
| + "'>" |
| + Utility.quoteXML(Utility.getDisplay(cp), true) |
| + "</span>"); |
| string.append("<span title='" + getRule() + "'><span" + status + "> </span> </span>"); |
| } else { |
| if (string.length() > 0) { |
| string.append(' '); |
| comment.append(' '); |
| } |
| |
| status = hasBreak ? BREAK : NOBREAK; |
| |
| string.append(Utility.hex(cp)); |
| comment.append(ucd.getName(cp) + " (" + getTypeID(cp) + ")"); |
| string.append(' ').append(status); |
| comment.append(' ').append(status).append(" [").append(getRule()).append(']'); |
| } |
| } |
| |
| if (comments && !html) string.append(comment); |
| out.println(string); |
| if (DEBUG) System.out.println("*" + string); |
| } |
| |
| public void findSamples() { |
| |
| // what we want is a list of sample characters. In the simple case, this is just one per type. |
| // However, if there are characters that have different types (when recommended or not), then |
| // we want a type for each cross-section |
| |
| BitSet bitset = new BitSet(); |
| Map list = new TreeMap(); |
| |
| for (int i = 1; i <= 0xFFFF; ++i) { |
| if (!ucd.isAllocated(i)) continue; |
| if (0xD800 <= i && i <= 0xDFFF) continue; |
| if (DEBUG && i == 0x1100) { |
| System.out.println("debug"); |
| } |
| byte lb = getSampleType(i); |
| byte lb2 = lb; // HACK |
| if (lb == lb2 && skipType(lb)) { |
| skippedSamples[lb] = i; |
| didSkipSamples = true; |
| continue; |
| } |
| |
| int combined = (mapType(lb) << 7) + mapType(lb2); |
| if (combined < 0) { |
| throw new IllegalArgumentException("should never happen"); |
| } |
| if (!bitset.get(combined)) { |
| bitset.set(combined); |
| list.put(new Integer(combined), UTF16.valueOf(i)); |
| } |
| /* |
| // if the sample slot is full OR |
| if (samples[lb] == null) { |
| samples[lb] = UTF16.valueOf(i); |
| if (sampleLimit <= lb) sampleLimit = lb + 1; |
| // byte lb2 = getType(i, true); |
| // if (lb2 != lb) bs.set(lb); |
| } |
| */ |
| } |
| |
| Iterator it = list.keySet().iterator(); |
| while (it.hasNext()) { |
| String sample = (String)list.get(it.next()); |
| samples[sampleLimit++] = sample; |
| if (DEBUG) System.out.println(getTypeID(sample) + ":\t" + ucd.getCodeAndName(sample)); |
| } |
| |
| tableLimit = sampleLimit; |
| |
| // now add values that are different |
| /* |
| |
| for (int i = 1; i <= 0x10FFFF; ++i) { |
| if (!ucd.isAllocated(i)) continue; |
| if (0xD800 <= i && i <= 0xDFFF) continue; |
| byte lb = getType(i); |
| byte lb2 = getType(i, true); |
| if (lb == lb2) continue; |
| // pick some different ones |
| if (!bs.get(lb)) { |
| samples[sampleLimit++] = UTF16.valueOf(i); |
| bs.set(lb); |
| } |
| if (!bs2.get(lb2)) { |
| samples[sampleLimit++] = UTF16.valueOf(i); |
| bs.set(lb2); |
| } |
| } |
| */ |
| |
| if (extraSamples.length > 0) { |
| System.arraycopy(extraSamples, 0, samples, sampleLimit, extraSamples.length); |
| sampleLimit += extraSamples.length; |
| } |
| } |
| |
| public int findLastNon(String source, int offset, byte notLBType) { |
| int cp; |
| for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(source, i); |
| byte f = getResolvedType(cp); |
| if (f != notLBType) return i; |
| } |
| return -1; |
| } |
| |
| public static UnicodeSet getSet(UCD ucd, int prop, byte propValue) { |
| return UnifiedBinaryProperty.make(prop | propValue, ucd).getSet(); |
| } |
| |
| static public class Context { |
| public int cpBefore2, cpBefore, cpAfter, cpAfter2; |
| public byte tBefore2, tBefore, tAfter, tAfter2; |
| public String toString() { |
| return "[" |
| + Utility.hex(cpBefore2) + "(" + tBefore2 + "), " |
| + Utility.hex(cpBefore) + "(" + tBefore + "), " |
| + Utility.hex(cpAfter) + "(" + tAfter + "), " |
| + Utility.hex(cpAfter2) + "(" + tAfter2 + ")]"; |
| } |
| } |
| |
| public void getGraphemeBases(MyBreakIterator graphemeIterator, String source, int offset, int ignoreType, Context context) { |
| context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1; |
| context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1; |
| //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(source) + "; " + offset + "; " + ignoreType); |
| |
| //MyBreakIterator graphemeIterator = new MyBreakIterator(new GenerateGraphemeBreakTest(ucd)); |
| |
| graphemeIterator.set(source, offset); |
| while (true) { |
| int cp = graphemeIterator.previousBase(); |
| if (cp == -1) break; |
| byte t = getResolvedType(cp); |
| if (t == ignoreType) continue; |
| |
| if (context.cpBefore == -1) { |
| context.cpBefore = cp; |
| context.tBefore = t; |
| } else { |
| context.cpBefore2 = cp; |
| context.tBefore2 = t; |
| break; |
| } |
| } |
| graphemeIterator.set(source, offset); |
| while (true) { |
| int cp = graphemeIterator.nextBase(); |
| if (cp == -1) break; |
| byte t = getResolvedType(cp); |
| if (t == ignoreType) continue; |
| |
| if (context.cpAfter == -1) { |
| context.cpAfter = cp; |
| context.tAfter = t; |
| } else { |
| context.cpAfter2 = cp; |
| context.tAfter2 = t; |
| break; |
| } |
| } |
| } |
| |
| |
| //============================================== |
| |
| static class XGenerateBreakTest extends GenerateBreakTest { |
| Segmenter seg; |
| String sample; |
| { |
| needsFullBreakSample = false; |
| } |
| |
| public XGenerateBreakTest(UCD ucd, Segmenter.Builder segBuilder, String sample, String filename, String[] extraSamples, String[] extraSingleSamples) { |
| super(ucd); |
| this.seg = segBuilder.make(); |
| this.sample = sample; |
| List rules = segBuilder.getRules(); |
| collectingRules = true; |
| for (Iterator it = rules.iterator(); it.hasNext();) { |
| String rule = (String)it.next(); |
| setRule(rule); |
| } |
| collectingRules = false; |
| map.add("Other", new UnicodeSet(0,0x10FFFF)); |
| UnicodeMap segSamples = seg.getSamples(); |
| Collection x = segSamples.getAvailableValues(); |
| for (Iterator it = x.iterator(); it.hasNext();) { |
| String label = (String)it.next(); |
| map.add(label, segSamples.getSet(label), true, false); |
| } |
| this.fileName = filename; |
| sampleMap = map; |
| this.extraSamples = extraSamples; |
| this.extraSingleSamples = extraSingleSamples; |
| } |
| |
| public boolean isBreak(String source, int offset) { |
| boolean result = seg.breaksAt(source, offset); |
| setRule(String.valueOf(seg.getBreakRule())); |
| return result; |
| } |
| |
| public String fullBreakSample() { |
| return sample; |
| } |
| |
| // stuff that subclasses need to override |
| public String getTypeID(int cp) { |
| return map.getLabel(cp); |
| } |
| |
| // stuff that subclasses need to override |
| public byte getType(int cp) { |
| return (byte) map.getIndex(cp); |
| } |
| } |
| |
| static class GenerateGraphemeBreakTest extends XGenerateBreakTest { |
| public GenerateGraphemeBreakTest(UCD ucd) { |
| super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"GraphemeClusterBreak"), "aa", "Grapheme", |
| new String[]{}, new String[]{}); |
| } |
| } |
| |
| static class GenerateLineBreakTest extends XGenerateBreakTest { |
| public GenerateLineBreakTest(UCD ucd) { |
| super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"LineBreak"), "aa", "Line", |
| new String[]{}, new String[] { |
| "can't", "can\u2019t", "ab\u00ADby", |
| "-3", |
| "e.g.", |
| "\u4e00.\u4e00.", |
| "a b", |
| "a \u200bb", |
| "a \u0308b", |
| "1\u0308b(a)-(b)", |
| }); |
| } |
| } |
| |
| static class GenerateSentenceBreakTest extends XGenerateBreakTest { |
| public GenerateSentenceBreakTest(UCD ucd) { |
| super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"SentenceBreak"), "aa", "Sentence", |
| new String[]{}, |
| getExtraSamples()); |
| } |
| static String[] getExtraSamples() { |
| GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(Default.ucd()); |
| String[] extraSingleSamples = new String[] { |
| "(\"Go.\") (He did.)", |
| "(\u201CGo?\u201D) (He did.)", |
| "U.S.A\u0300. is", |
| "U.S.A\u0300? He", |
| "U.S.A\u0300.", |
| "3.4", |
| "c.d", |
| "etc.)\u2019 \u2018(the", |
| "etc.)\u2019 \u2018(The", |
| "the resp. leaders are", |
| "\u5B57.\u5B57", |
| "etc.\u5B83", |
| "etc.\u3002", |
| "\u5B57\u3002\u5B83", |
| }; |
| String[] temp = new String [extraSingleSamples.length * 2]; |
| System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length); |
| for (int i = 0; i < extraSingleSamples.length; ++i) { |
| temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme); |
| } |
| extraSingleSamples = temp; |
| return extraSingleSamples; |
| } |
| } |
| |
| static class GenerateWordBreakTest extends XGenerateBreakTest { |
| public GenerateWordBreakTest(UCD ucd) { |
| super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"WordBreak"), "aa", "Word", |
| new String[] { |
| /*"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060" |
| }, |
| |
| |
| getExtraSamples()); |
| } |
| static String[] getExtraSamples() { |
| GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(Default.ucd()); |
| String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" }; |
| String[] extraSingleSamples = new String [temp.length * 2]; |
| System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length); |
| for (int i = 0; i < temp.length; ++i) { |
| extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); |
| } |
| |
| return extraSingleSamples; |
| } |
| } |
| |
| static class OLDGenerateGraphemeBreakTest extends GenerateBreakTest { |
| |
| OLDGenerateGraphemeBreakTest(UCD ucd) { |
| super(ucd); |
| fileName = "Grapheme"; |
| sampleMap = map; |
| } |
| |
| Object foo = prop = unicodePropertySource.getProperty("Grapheme_Cluster_Break"); |
| |
| final int |
| CR = addToMap("CR"), |
| LF = addToMap("LF"), |
| Control = addToMap("Control"), |
| Extend = addToMap("Extend"), |
| L = addToMap("L"), |
| V = addToMap("V"), |
| T = addToMap("T"), |
| LV = addToMap("LV"), |
| LVT = addToMap("LVT"), |
| Other = addToMapLast("Other"); |
| |
| // stuff that subclasses need to override |
| public String getTypeID(int cp) { |
| return map.getLabel(cp); |
| } |
| |
| // stuff that subclasses need to override |
| public byte getType(int cp) { |
| return (byte) map.getIndex(cp); |
| } |
| |
| public String fullBreakSample() { |
| return "aa"; |
| } |
| |
| public boolean isBreak(String source, int offset) { |
| |
| setRule("1: sot ÷"); |
| if (offset < 0 || offset > source.length()) return false; |
| if (offset == 0) return true; |
| |
| setRule("2: ÷ eot"); |
| if (offset == source.length()) return true; |
| |
| // UTF-16: never break in the middle of a code point |
| if (!onCodepointBoundary(source, offset)) return false; |
| |
| // now get the character before and after, and their types |
| |
| |
| int cpBefore = UTF16.charAt(source, offset-1); |
| int cpAfter = UTF16.charAt(source, offset); |
| |
| byte before = getResolvedType(cpBefore); |
| byte after = getResolvedType(cpAfter); |
| |
| setRule("3: CR × LF"); |
| if (before == CR && after == LF) return false; |
| |
| setRule("4: ( Control | CR | LF ) ÷"); |
| if (before == CR || before == LF || before == Control) return true; |
| |
| setRule("5: ÷ ( Control | CR | LF )"); |
| if (after == Control || after == LF || after == CR) return true; |
| |
| setRule("6: L × ( L | V | LV | LVT )"); |
| if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; |
| |
| setRule("7: ( LV | V ) × ( V | T )"); |
| if ((before == LV || before == V) && (after == V || after == T)) return false; |
| |
| setRule("8: ( LVT | T ) × T"); |
| if ((before == LVT || before == T) && (after == T)) return false; |
| |
| setRule("9: × Extend"); |
| if (after == Extend) return false; |
| |
| // Otherwise break after all characters. |
| setRule("10: Any ÷ Any"); |
| return true; |
| |
| } |
| |
| } |
| |
| //============================================== |
| |
| static class XGenerateWordBreakTest extends GenerateBreakTest { |
| |
| GenerateGraphemeBreakTest grapheme; |
| MyBreakIterator breaker; |
| Context context = new Context(); |
| |
| XGenerateWordBreakTest(UCD ucd) { |
| super(ucd); |
| grapheme = new GenerateGraphemeBreakTest(ucd); |
| breaker = new MyBreakIterator(grapheme); |
| fileName = "Word"; |
| sampleMap = map; |
| extraSamples = new String[] { |
| /*"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060" |
| }; |
| |
| String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" }; |
| extraSingleSamples = new String [temp.length * 2]; |
| System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length); |
| for (int i = 0; i < temp.length; ++i) { |
| extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); |
| } |
| |
| if (false) Utility.showSetDifferences("Katakana", map.getSetFromIndex(Katakana), |
| "Script=Katakana", getSet(ucd, SCRIPT, KATAKANA_SCRIPT), false, ucd); |
| |
| } |
| |
| Object foo = prop = unicodePropertySource.getProperty("Word_Break"); |
| |
| //static String LENGTH = "[\u30FC\uFF70]"; |
| //static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]"; |
| //static String KATAKANA_ITERATION = "[\u30FD\u30FE]"; |
| //static String HIRAGANA_ITERATION = "[\u309D\u309E]"; |
| |
| final int |
| Format = addToMap("Format"), |
| Katakana = addToMap("Katakana"), |
| ALetter = addToMap("ALetter"), |
| MidLetter = addToMap("MidLetter"), |
| //MidNumLet = addToMap("MidNumLet"), |
| MidNum = addToMap("MidNum"), |
| Numeric = addToMap("Numeric"), |
| ExtendNumLet = addToMap("ExtendNumLet"), |
| Other = addToMapLast("Other"); |
| |
| // stuff that subclasses need to override |
| public String getTypeID(int cp) { |
| return map.getLabel(cp); |
| } |
| |
| // stuff that subclasses need to override |
| public byte getType(int cp) { |
| return (byte) map.getIndex(cp); |
| } |
| |
| public String fullBreakSample() { |
| return " a"; |
| } |
| |
| public int genTestItems(String before, String after, String[] results) { |
| results[0] = before + after; |
| results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; |
| results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + 'a'; |
| results[3] = 'a' + before + "\u0301\u0308" + samples[MidNum] + after + "\u0301\u0308" + 'a'; |
| return 3; |
| } |
| |
| public boolean isBreak(String source, int offset) { |
| |
| setRule("1: sot ÷"); |
| if (offset < 0 || offset > source.length()) return false; |
| |
| if (offset == 0) return true; |
| |
| setRule("2: ÷ eot"); |
| if (offset == source.length()) return true; |
| |
| // Treat a grapheme cluster as if it were a single character: |
| // the first base character, if there is one; otherwise the first character. |
| |
| setRule("3: GC -> FC"); |
| if (!grapheme.isBreak( source, offset)) return false; |
| |
| setRule("4: X Format* -> X"); |
| byte afterChar = getResolvedType(source.charAt(offset)); |
| if (afterChar == Format) return false; |
| |
| // now get the base character before and after, and their types |
| |
| getGraphemeBases(breaker, source, offset, Format, context); |
| |
| byte before = context.tBefore; |
| byte after = context.tAfter; |
| byte before2 = context.tBefore2; |
| byte after2 = context.tAfter2; |
| |
| //Don't break between most letters |
| |
| setRule("5: ALetter × ALetter"); |
| if (before == ALetter && after == ALetter) return false; |
| |
| // Don’t break letters across certain punctuation |
| |
| setRule("6: ALetter × MidLetter ALetter"); |
| if (before == ALetter && after == MidLetter && after2 == ALetter) return false; |
| |
| setRule("7: ALetter (MidLetter | MidNumLet) × ALetter"); |
| if (before2 == ALetter && before == MidLetter && after == ALetter) return false; |
| |
| // Don’t break within sequences of digits, or digits adjacent to letters. |
| |
| setRule("8: Numeric × Numeric"); |
| if (before == Numeric && after == Numeric) return false; |
| |
| setRule("9: ALetter × Numeric"); |
| if (before == ALetter && after == Numeric) return false; |
| |
| setRule("10: Numeric × ALetter"); |
| if (before == Numeric && after == ALetter) return false; |
| |
| |
| // Don’t break within sequences like: '-3.2' |
| setRule("11: Numeric (MidNum | MidNumLet) × Numeric"); |
| if (before2 == Numeric && before == MidNum && after == Numeric) return false; |
| |
| setRule("12: Numeric × (MidNum | MidNumLet) Numeric"); |
| if (before == Numeric && after == MidNum && after2 == Numeric) return false; |
| |
| // Don't break between Katakana |
| |
| setRule("13: Katakana × Katakana"); |
| if (before == Katakana && after == Katakana) return false; |
| |
| // Do not break from extenders |
| setRule("13a: (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet"); |
| if ((before == ALetter || before == Numeric || before == Katakana || before == ExtendNumLet) && after == ExtendNumLet) return false; |
| |
| setRule("13b: ExtendNumLet × (ALetter | Numeric | Katakana)"); |
| if (before == ExtendNumLet && (after == ALetter || after == Numeric || after == Katakana)) return false; |
| |
| // Otherwise break always. |
| setRule("14: Any ÷ Any"); |
| return true; |
| |
| } |
| |
| } |
| |
| // ======================================== |
| |
| static class XGenerateLineBreakTest extends GenerateBreakTest { |
| |
| GenerateGraphemeBreakTest grapheme; |
| MyBreakIterator breaker; |
| Context context = new Context(); |
| |
| XGenerateLineBreakTest(UCD ucd) { |
| super(ucd); |
| grapheme = new GenerateGraphemeBreakTest(ucd); |
| breaker = new MyBreakIterator(grapheme); |
| |
| sampleMap = map; |
| fileName = "Line"; |
| extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby", |
| "-3", |
| "e.g.", |
| "\u4e00.\u4e00.", |
| "a b", |
| "a \u200bb", |
| "a \u0308b", |
| "1\u0308b(a)-(b)", |
| }; |
| } |
| |
| // all the other items are supplied in UCD_TYPES |
| |
| /*static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT, |
| LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT, |
| LB2_LIMIT = (byte)(LB_SUP + 1); |
| */ |
| |
| /* |
| private byte[] AsmusOrderToMyOrder = { |
| LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO, |
| LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM, |
| // missing from Pair Table |
| LB_SP, LB_BK, LB_CR, LB_LF, |
| // resolved types below |
| LB_CB, LB_AI, LB_SA, LB_SG, LB_XX, |
| // 3 JAMO CLASSES, plus supplementary |
| LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP |
| }; |
| |
| private byte[] MyOrderToAsmusOrder = new byte[AsmusOrderToMyOrder.length]; |
| { |
| for (byte i = 0; i < AsmusOrderToMyOrder.length; ++i) { |
| MyOrderToAsmusOrder[AsmusOrderToMyOrder[i]] = i; |
| } |
| */ |
| |
| { |
| //System.out.println("Adding Linebreak"); |
| for (int i = 0; i <= 0x10FFFF; ++i) { |
| map.put(i, ucd.getLineBreak(i)); |
| } |
| for (int i = 0; i < LB_LIMIT; ++i) { |
| map.setLabel(i, ucd.getLineBreakID_fromIndex((byte)i, SHORT)); |
| } |
| //System.out.println(map.getSetFromIndex(LB_CL)); |
| //System.out.println("Done adding Linebreak"); |
| } |
| |
| public int mapType(int input) { |
| int old = input; |
| switch (input) { |
| case LB_BA: input = 16; break; |
| case LB_BB: input = 17; break; |
| case LB_B2: input = 18; break; |
| case LB_ZW: input = 19; break; |
| case LB_CM: input = 20; break; |
| case LB_WJ: input = 21; break; |
| |
| case LB_SP: input = 22; break; |
| case LB_BK: input = 23; break; |
| case LB_NL: input = 24; break; |
| case LB_CR: input = 25; break; |
| case LB_LF: input = 26; break; |
| |
| case LB_CB: input = 27; break; |
| case LB_SA: input = 28; break; |
| case LB_AI: input = 29; break; |
| case LB_SG: input = 30; break; |
| } |
| //if (old != input) System.out.println(old + " => " + input); |
| return input; |
| } |
| |
| |
| public void sampleDescription(PrintWriter out) { |
| out.println("# Samples:"); |
| out.println("# The test currently takes all pairs of linebreak types*,"); |
| out.println("# picks a sample for each type, and generates three strings: "); |
| out.println("#\t- the pair alone"); |
| out.println("#\t- the pair alone with an imbeded space"); |
| out.println("#\t- the pair alone with embedded combining marks"); |
| out.println("# The sample for each type is simply the first code point (above NULL)"); |
| out.println("# with that property."); |
| out.println("# * Note:"); |
| out.println("#\t- SG is omitted"); |
| out.println("#\t- 3 different Jamo characters and a supplementary character are added"); |
| out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments"); |
| out.println("#\t instead of the linebreak property"); |
| out.println("#"); |
| } |
| |
| // stuff that subclasses need to override |
| public int genTestItems(String before, String after, String[] results) { |
| results[0] = before + after; |
| results[1] = before + " " + after; |
| results[2] = before + "\u0301\u0308" + after; |
| return 3; |
| } |
| |
| // stuff that subclasses need to override |
| boolean skipType(int type) { |
| return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX |
| || type == LB_CB || type == LB_CR || type == LB_BK || type == LB_LF |
| || type == LB_NL || type == LB_SP; |
| } |
| |
| // stuff that subclasses need to override |
| public String getTypeID(int cp) { |
| /* |
| byte result = getType(cp); |
| if (result == LB_SUP) return "SUP"; |
| if (result >= LB_LIMIT) return hNames[result - LB_LIMIT]; |
| */ |
| // return ucd.getLineBreakID_fromIndex(cp); // AsmusOrderToMyOrder[result]); |
| return ucd.getLineBreakID(cp); // AsmusOrderToMyOrder[result]); |
| } |
| |
| public String fullBreakSample() { |
| return ")a"; |
| } |
| |
| // stuff that subclasses need to override |
| public byte getType(int cp) { |
| /*if (cp > 0xFFFF) return LB_SUP; |
| byte result = getHangulType(cp); |
| if (result != hNot) return (byte)(result + LB_LIMIT); |
| */ |
| // return MyOrderToAsmusOrder[ucd.getLineBreak(cp)]; |
| return ucd.getLineBreak(cp); |
| } |
| |
| public String getTableEntry(String before, String after, String[] ruleOut) { |
| String t = "_"; // break |
| boolean spaceBreak = isBreak(before + " " + after, before.length()+1); |
| String spaceRule = getRule(); |
| |
| boolean spaceBreak2 = isBreak(before + " " + after, before.length()); |
| String spaceRule2 = getRule(); |
| |
| boolean normalBreak = isBreak(before + after, before.length()); |
| String normalRule = getRule(); |
| |
| ruleOut[0] = normalRule; |
| if (!normalBreak) { |
| if (!spaceBreak && !spaceBreak2) { |
| t = "^"; // don't break, even with intervening spaces |
| } else { |
| t = "%"; // don't break, but break with intervening spaces |
| } |
| if (!spaceRule2.equals(normalRule)) { |
| ruleOut[0] += " [" + spaceRule2 + "]"; |
| } |
| if (!spaceRule.equals(normalRule) && !spaceRule.equals(spaceRule2)) { |
| ruleOut[0] += " {" + spaceRule + "}"; |
| } |
| } |
| return t; |
| } |
| |
| public boolean highlightTableEntry(int x, int y, String s) { |
| return false; |
| /* |
| try { |
| return !oldLineBreak[x][y].equals(s); |
| } catch (Exception e) {} |
| return true; |
| */ |
| } |
| |
| /* |
| String[][] oldLineBreak = { |
| {"^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "%"}, |
| {"_", "^", "%", "%", "^", "^", "^", "^", " ", "%", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, |
| {"^", "^", "%", "%", "%", "^", "^", "^", "%", "%", "%", "%", "%", "%", "%", "%", "%", "%", "^", "%"}, |
| {"%", "^", "%", "%", "%", "^", "^", "^", "%", "%", "%", "%", "%", "%", "%", "%", "%", "%", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, |
| {"%", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "%", "%", "_", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "%", "%", "%", "_", "%", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "%", "_", "%", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "%", "_", "_", "_", "%", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "%", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, |
| {"%", "^", "%", "%", "%", "^", "^", "^", "%", "%", "%", "%", "%", "%", "%", "%", "%", "%", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "^", "^", "%"}, |
| {"_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "^", "%"}, |
| {"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "%", "_", "%", "%", "%", "_", "_", "^", "%"} |
| }; |
| */ |
| |
| public byte getResolvedType (int cp) { |
| // LB 1 Assign a line break category to each character of the input. |
| // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. |
| byte result = getType(cp); |
| switch (result) { |
| case LB_AI: result = LB_AI; break; |
| // case LB_CB: result = LB_ID; break; |
| case LB_SA: result = LB_AL; break; |
| // case LB_SG: result = LB_XX; break; Surrogates; will never occur |
| case LB_XX: result = LB_AL; break; |
| } |
| /* |
| if (recommended) { |
| if (getHangulType(cp) != hNot) { |
| result = LB_ID; |
| } |
| } |
| */ |
| |
| return result; |
| } |
| |
| public byte getSampleType (int cp) { |
| if (ucd.getHangulSyllableType(cp) != NA) return LB_XX; |
| return getType(cp); |
| } |
| |
| |
| // find out whether there is a break at offset |
| // WARNING: as a side effect, sets "rule" |
| |
| public boolean isBreak(String source, int offset) { |
| |
| // LB 1 Assign a line break category to each character of the input. |
| // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. |
| // this is taken care of in the getResolvedType function |
| |
| // LB 2a Never break at the start of text |
| |
| setRule("2a: × sot"); |
| if (offset <= 0) return false; |
| |
| // LB 2b Always break at the end of text |
| |
| setRule("2b: ! eot"); |
| if (offset >= source.length()) return true; |
| |
| |
| // UTF-16: never break in the middle of a code point |
| |
| // now get the base character before and after, and their types |
| |
| getGraphemeBases(breaker, source, offset, -1, context); |
| |
| byte before = context.tBefore; |
| byte after = context.tAfter; |
| byte before2 = context.tBefore2; |
| byte after2 = context.tAfter2; |
| |
| |
| //if (!onCodepointBoundary(source, offset)) return false; |
| |
| |
| // now get the character before and after, and their types |
| |
| |
| //int cpBefore = UTF16.charAt(source, offset-1); |
| //int cpAfter = UTF16.charAt(source, offset); |
| |
| //byte before = getResolvedType(cpBefore); |
| //byte after = getResolvedType(cpAfter); |
| |
| |
| setRule("3a: CR × LF ; ( BK | CR | LF | NL ) !"); |
| |
| // Always break after hard line breaks (but never between CR and LF). |
| // CR ^ LF |
| if (before == LB_CR && after == LB_LF) return false; |
| if (before == LB_BK || before == LB_LF || before == LB_CR) return true; |
| |
| //LB 3b Don’t break before hard line breaks. |
| setRule("3b: × ( BK | CR | LF )"); |
| if (after == LB_BK || after == LB_LF || after == LB_CR) return false; |
| |
| // LB 4 Don’t break before spaces or zero-width space. |
| setRule("4: × ( SP | ZW )"); |
| if (after == LB_SP || after == LB_ZW) return false; |
| |
| // LB 5 Break after zero-width space. |
| setRule("5: ZW ÷"); |
| if (before == LB_ZW) return true; |
| |
| // LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. |
| setRule("6: DGC -> FC"); |
| if (!grapheme.isBreak( source, offset)) return false; |
| |
| /* |
| if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false; |
| if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false; |
| if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false; |
| */ |
| |
| byte backBase = -1; |
| boolean setBase = false; |
| if (before == LB_CM) { |
| setBase = true; |
| int backOffset = findLastNon(source, offset, LB_CM); |
| if (backOffset >= 0) { |
| backBase = getResolvedType(UTF16.charAt(source, backOffset)); |
| } |
| } |
| |
| |
| // LB 7 In all of the following rules, if a space is the base character for a combining mark, |
| // the space is changed to type ID. In other words, break before SP CM* in the same cases as |
| // one would break before an ID. |
| setRule("7: SP CM* -> ID"); |
| if (setBase && backBase == LB_SP) before = LB_ID; |
| if (after == LB_SP && after2 == LB_CM) after = LB_ID; |
| |
| setRule("7a: X CM* -> X"); |
| if (after == LB_CM) return false; |
| if (setBase && backBase != -1) before = LB_ID; |
| |
| setRule("7b: CM -> AL"); |
| if (setBase && backBase == -1) before = LB_AL; |
| |
| |
| // LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. |
| // × CL, × EX, × IS, × SY |
| setRule("8: × ( CL | EX | IS | SY )"); |
| if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false; |
| |
| |
| // find the last non-space character; we will need it |
| byte lastNonSpace = before; |
| if (lastNonSpace == LB_SP) { |
| int backOffset = findLastNon(source, offset, LB_SP); |
| if (backOffset >= 0) { |
| lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset)); |
| } |
| } |
| |
| // LB 9 Don’t break after ‘[’, even after spaces. |
| // OP SP* × |
| setRule("9: OP SP* ×"); |
| if (lastNonSpace == LB_OP) return false; |
| |
| // LB 10 Don’t break within ‘�?[’, , even with intervening spaces. |
| // QU SP* × OP |
| setRule("10: QU SP* × OP"); |
| if (lastNonSpace == LB_QU && after == LB_OP) return false; |
| |
| // LB 11 Don’t break within ‘]h’, even with intervening spaces. |
| // CL SP* × NS |
| setRule("11: CL SP* × NS"); |
| if (lastNonSpace == LB_CL && after == LB_NS) return false; |
| |
| // LB 11a Don’t break within ‘——’, even with intervening spaces. |
| // B2 × B2 |
| setRule("11a: B2 × B2"); |
| if (lastNonSpace == LB_B2 && after == LB_B2) return false; |
| |
| |
| // LB 13 Don’t break before or after NBSP or WORD JOINER |
| // × GL |
| // GL × |
| |
| setRule("11b: × WJ ; WJ ×"); |
| if (after == LB_WJ || before == LB_WJ) return false; |
| |
| // [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.] |
| |
| // LB 12 Break after spaces |
| setRule("12: SP ÷"); |
| if (before == LB_SP) return true; |
| |
| // LB 13 Don’t break before or after NBSP or WORD JOINER |
| setRule("13: × GL ; GL ×"); |
| if (after == LB_GL || before == LB_GL) return false; |
| |
| // LB 14 Don’t break before or after ‘�?’ |
| setRule("14: × QU ; QU ×"); |
| if (before == LB_QU || after == LB_QU) return false; |
| |
| // LB 14a Break before and after CB |
| setRule("14a: ÷ CB ; CB ÷"); |
| if (before == LB_CB || after == LB_CB) return true; |
| |
| // LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces, |
| // small kana and other non- starters, or after acute accents: |
| |
| setRule("15: × ( BA | HY | NS ) ; BB ×"); |
| if (after == LB_NS) return false; |
| if (after == LB_HY) return false; |
| if (after == LB_BA) return false; |
| if (before == LB_BB) return false; |
| |
| |
| //setRule("15a: HY × NU"); // NEW |
| //if (before == LB_HY && after == LB_NU) return false; |
| |
| // LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis: |
| // Examples: ’9...’, ‘a...’, ‘H...’ |
| setRule("16: ( AL | ID | IN | NU ) × IN"); |
| if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false; |
| if (before == LB_IN && after == LB_IN) return false; |
| |
| // Don't break alphanumerics. |
| // LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’ |
| // Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ? |
| // Examples: $(12.35) 2,1234 (12)¢ 12.54¢ |
| // This is approximated with the following rules. (Some cases already handled above, |
| // like ‘9,’, ‘[9’.) |
| setRule("17: ID × PO ; AL × NU; NU × AL"); |
| if (before == LB_ID && after == LB_PO) return false; |
| if (before == LB_AL && after == LB_NU) return false; |
| if (before == LB_NU && after == LB_AL) return false; |
| |
| // LB 18 Don’t break between the following pairs of classes. |
| // CL × PO |
| // HY × NU |
| // IS × NU |
| // NU × NU |
| // NU × PO |
| // PR × AL |
| // PR × HY |
| // PR × ID |
| // PR × NU |
| // PR × OP |
| // SY × NU |
| // Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’ |
| |
| setRule("18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP )"); |
| if (before == LB_CL && after == LB_PO) return false; |
| if (before == LB_IS && after == LB_NU) return false; |
| if (before == LB_NU && after == LB_NU) return false; |
| if (before == LB_NU && after == LB_PO) return false; |
| |
| if (before == LB_HY && after == LB_NU) return false; |
| |
| if (before == LB_PR && after == LB_AL) return false; |
| if (before == LB_PR && after == LB_HY) return false; |
| if (before == LB_PR && after == LB_ID) return false; |
| if (before == LB_PR && after == LB_NU) return false; |
| if (before == LB_PR && after == LB_OP) return false; |
| |
| if (before == LB_SY && after == LB_NU) return false; |
| |
| // LB 15b Break after hyphen-minus, and before acute accents: |
| setRule("18b: HY ÷ ; ÷ BB"); |
| if (before == LB_HY) return true; |
| if (after == LB_BB) return true; |
| |
| // LB 19 Don’t break between alphabetics (“at�?) |
| // AL × AL |
| |
| setRule("19: AL × AL"); |
| if (before == LB_AL && after == LB_AL) return false; |
| |
| // LB 20 Break everywhere else |
| // ALL ÷ |
| // ÷ ALL |
| |
| if (ucd.getCompositeVersion() > 0x040000) { |
| setRule("19b: IS × AL"); |
| if (before == LB_IS && after == LB_AL) return false; |
| } |
| |
| // LB 20 Break everywhere else |
| // ALL ÷ |
| // ÷ ALL |
| |
| setRule("20: ALL ÷ ; ÷ ALL"); |
| return true; |
| } |
| } |
| |
| //============================================== |
| |
| static class XGenerateSentenceBreakTest extends GenerateBreakTest { |
| |
| GenerateGraphemeBreakTest grapheme; |
| MyBreakIterator breaker; |
| |
| XGenerateSentenceBreakTest(UCD ucd) { |
| super(ucd); |
| grapheme = new GenerateGraphemeBreakTest(ucd); |
| breaker = new MyBreakIterator(grapheme); |
| |
| fileName = "Sentence"; |
| extraSamples = new String[] { |
| }; |
| |
| extraSingleSamples = new String[] { |
| "(\"Go.\") (He did.)", |
| "(\u201CGo?\u201D) (He did.)", |
| "U.S.A\u0300. is", |
| "U.S.A\u0300? He", |
| "U.S.A\u0300.", |
| "3.4", |
| "c.d", |
| "etc.)\u2019 \u2018(the", |
| "etc.)\u2019 \u2018(The", |
| "the resp. leaders are", |
| "\u5B57.\u5B57", |
| "etc.\u5B83", |
| "etc.\u3002", |
| "\u5B57\u3002\u5B83", |
| }; |
| String[] temp = new String [extraSingleSamples.length * 2]; |
| System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length); |
| for (int i = 0; i < extraSingleSamples.length; ++i) { |
| temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme); |
| } |
| extraSingleSamples = temp; |
| |
| } |
| |
| Object foo = prop = unicodePropertySource.getProperty("Sentence_Break"); |
| |
| final int |
| Sep = addToMap("Sep"), |
| Format = addToMap("Format"), |
| Sp = addToMap("Sp"), |
| Lower = addToMap("Lower"), |
| Upper = addToMap("Upper"), |
| OLetter = addToMap("OLetter"), |
| Numeric = addToMap("Numeric"), |
| ATerm = addToMap("ATerm"), |
| STerm = addToMap("STerm"), |
| Close = addToMap("Close"), |
| Other = addToMapLast("Other"); |
| |
| // stuff that subclasses need to override |
| public String getTypeID(int cp) { |
| return map.getLabel(cp); |
| } |
| |
| public String fullBreakSample() { |
| return "!a"; |
| } |
| |
| // stuff that subclasses need to override |
| public byte getType(int cp) { |
| return (byte) map.getIndex(cp); |
| } |
| |
| /*LB_XX = 0, LB_OP = 1, LB_CL = 2, LB_QU = 3, LB_GL = 4, LB_NS = 5, LB_EX = 6, LB_SY = 7, |
| LB_IS = 8, LB_PR = 9, LB_PO = 10, LB_NU = 11, LB_AL = 12, LB_ID = 13, LB_IN = 14, LB_HY = 15, |
| LB_CM = 16, LB_BB = 17, LB_BA = 18, LB_SP = 19, LB_BK = 20, LB_CR = 21, LB_LF = 22, LB_CB = 23, |
| LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28, |
| LB_NL = 29, |
| LB_WJ = 30, |
| */ |
| /* |
| static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5, |
| Numeric = 6, Close = 7, ATerm = 8, Term = 9, Other = 10, |
| LIMIT = Other + 1; |
| |
| static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper", "Numeric", |
| "Close", "ATerm", "Term", "Other" }; |
| |
| |
| static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]"); |
| static UnicodeSet atermSet = new UnicodeSet("[\\u002E]"); |
| static UnicodeSet termSet = new UnicodeSet( |
| "[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934" |
| + "\\u1362\\u1367\\u1368\\u104A\\u104B\\u166E" |
| + "\\u1803\\u1809\\u203c\\u203d" |
| + "\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]"); |
| |
| static UnicodeProperty lowercaseProp = UnifiedBinaryProperty.make(DERIVED | PropLowercase); |
| static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase); |
| |
| UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NU).getSet(); |
| */ |
| |
| /* |
| // stuff that subclasses need to override |
| public String getTypeID(int cp) { |
| byte type = getType(cp); |
| return Names[type]; |
| } |
| |
| // stuff that subclasses need to override |
| public byte getType(int cp) { |
| byte cat = ucd.getCategory(cp); |
| |
| if (cat == Cf) return Format; |
| if (sepSet.contains(cp)) return Sep; |
| if (ucd.getBinaryProperty(cp, White_space)) return Sp; |
| if (linebreakNS.contains(cp)) return Numeric; |
| if (lowercaseProp.hasValue(cp)) return Lower; |
| if (uppercaseProp.hasValue(cp) || cat == Lt) return Upper; |
| if (alphabeticSet.contains(cp)) return OLetter; |
| if (atermSet.contains(cp)) return ATerm; |
| if (termSet.contains(cp)) return Term; |
| if (cat == Po || cat == Pe |
| || ucd.getLineBreak(cp) == LB_QU) return Close; |
| return Other; |
| } |
| */ |
| |
| public int genTestItems(String before, String after, String[] results) { |
| results[0] = before + after; |
| /* |
| results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; |
| results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + 'a'; |
| results[3] = 'a' + before + "\u0301\u0308" + samples[MidNum] + after + "\u0301\u0308" + 'a'; |
| */ |
| return 1; |
| } |
| |
| static Context context = new Context(); |
| |
| public boolean isBreak(String source, int offset) { |
| |
| // Break at the start and end of text. |
| setRule("1: sot ÷"); |
| if (offset < 0 || offset > source.length()) return false; |
| |
| if (offset == 0) return true; |
| |
| setRule("2: ÷ eot"); |
| if (offset == source.length()) return true; |
| |
| setRule("3: Sep ÷"); |
| byte beforeChar = getResolvedType(source.charAt(offset-1)); |
| if (beforeChar == Sep) return true; |
| |
| // Treat a grapheme cluster as if it were a single character: |
| // the first base character, if there is one; otherwise the first character. |
| |
| setRule("4: GC -> FC"); |
| if (!grapheme.isBreak( source, offset)) return false; |
| |
| // Ignore interior Format characters. That is, ignore Format characters in all subsequent rules. |
| setRule("5: X Format* -> X"); |
| byte afterChar = getResolvedType(source.charAt(offset)); |
| if (afterChar == Format) return false; |
| |
| getGraphemeBases(breaker, source, offset, Format, context); |
| byte before = context.tBefore; |
| byte after = context.tAfter; |
| byte before2 = context.tBefore2; |
| byte after2 = context.tAfter2; |
| |
| // HACK COPY for rule collection! |
| if (collectingRules) { |
| setRule("6: ATerm × ( Numeric | Lower )"); |
| setRule("7: Upper ATerm × Upper"); |
| setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); |
| setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"); |
| setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"); |
| setRule("11: ( Term | ATerm ) Close* Sp* ÷"); |
| setRule("12: Any × Any"); |
| collectingRules = false; |
| } |
| |
| // Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence. |
| |
| if (before == ATerm) { |
| setRule("6: ATerm × ( Numeric | Lower )"); |
| if (after == Lower || after == Numeric) return false; |
| setRule("7: Upper ATerm × Upper"); |
| if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper); |
| if (before2 == Upper && after == Upper) return false; |
| } |
| |
| // The following cases are all handled together. |
| |
| // First we loop backwards, checking for the different types. |
| |
| MyBreakIterator graphemeIterator = new MyBreakIterator(grapheme); |
| graphemeIterator.set(source, offset); |
| |
| int state = 0; |
| int lookAfter = -1; |
| int cp; |
| byte t; |
| boolean gotSpace = false; |
| boolean gotClose = false; |
| |
| behindLoop: |
| while (true) { |
| cp = graphemeIterator.previousBase(); |
| if (cp == -1) break; |
| t = getResolvedType(cp); |
| if (SHOW_TYPE) System.out.println(ucd.getCodeAndName(cp) + ", " + getTypeID(cp)); |
| |
| if (t == Format) continue; // ignore all formats! |
| |
| switch (state) { |
| case 0: |
| if (t == Sp) { |
| // loop as long as we have Space |
| gotSpace = true; |
| continue behindLoop; |
| } else if (t == Close) { |
| gotClose = true; |
| state = 1; // go to close loop |
| continue behindLoop; |
| } |
| break; |
| case 1: |
| if (t == Close) { |
| // loop as long as we have Close |
| continue behindLoop; |
| } |
| break; |
| } |
| if (t == ATerm) { |
| lookAfter = ATerm; |
| } else if (t == STerm) { |
| lookAfter = STerm; |
| } |
| break; |
| } |
| |
| // if we didn't find ATerm or Term, bail |
| |
| if (lookAfter == -1) { |
| // Otherwise, do not break |
| // Any × Any (11) |
| setRule("12: Any × Any"); |
| return false; |
| } |
| |
| // ATerm Close* Sp*×(¬( OLetter))* Lower(8) |
| |
| // Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator. |
| // ( Term | ATerm ) Close*×( Close | Sp | Sep )(9) |
| // ( Term | ATerm ) Close* Sp×( Sp | Sep )(10) |
| // ( Term | ATerm ) Close* Sp*÷(11) |
| |
| |
| // We DID find one. Loop to see if the right side is ok. |
| |
| graphemeIterator.set(source, offset); |
| boolean isFirst = true; |
| while (true) { |
| cp = graphemeIterator.nextBase(); |
| if (cp == -1) break; |
| t = getResolvedType(cp); |
| if (SHOW_TYPE) System.out.println(ucd.getCodeAndName(cp) + ", " + getTypeID(cp)); |
| |
| if (t == Format) continue; // skip format characters! |
| |
| if (isFirst) { |
| isFirst = false; |
| if (lookAfter == ATerm && t == Upper) { |
| setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); |
| return false; |
| } |
| if (gotSpace) { |
| if (t == Sp || t == Sep) { |
| setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"); |
| return false; |
| } |
| } else if (t == Close || t == Sp || t == Sep) { |
| setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"); |
| return false; |
| } |
| if (lookAfter == STerm) break; |
| } |
| |
| // at this point, we have an ATerm. All other conditions are ok, but we need to verify 6 |
| if (t != OLetter && t != Upper && t != Lower) continue; |
| if (t == Lower) { |
| setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); |
| return false; |
| } |
| break; |
| } |
| setRule("11: ( Term | ATerm ) Close* Sp* ÷"); |
| return true; |
| } |
| } |
| |
| static final boolean DEBUG_GRAPHEMES = false; |
| |
| static class MyBreakIterator { |
| int offset = 0; |
| String string = ""; |
| GenerateBreakTest breaker; |
| boolean recommended = true; |
| |
| MyBreakIterator(GenerateBreakTest breaker) { |
| this.breaker = breaker; // = new GenerateGraphemeBreakTest() |
| } |
| public MyBreakIterator set(String source, int offset) { |
| //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(string) + "; " + offset); |
| string = source; |
| this.offset = offset; |
| return this; |
| } |
| |
| public int nextBase() { |
| if (offset >= string.length()) return -1; |
| int result = UTF16.charAt(string, offset); |
| for (++offset; offset < string.length(); ++offset) { |
| if (breaker.isBreak(string, offset)) break; |
| } |
| //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result)); |
| return result; |
| } |
| |
| public int previousBase() { |
| if (offset <= 0) return -1; |
| for (--offset; offset >= 0; --offset) { |
| if (breaker.isBreak(string, offset)) break; |
| } |
| int result = UTF16.charAt(string, offset); |
| //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result)); |
| return result; |
| } |
| } |
| /* |
| * |
| * if (false) { |
| |
| PrintWriter log = Utility.openPrintWriter("Diff.txt", Utility.UTF8_WINDOWS); |
| UnicodeSet Term = new UnicodeSet( |
| "[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367" |
| + "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049" |
| + "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]"); |
| UnicodeSet terminal_punctuation = getSet(BINARY_PROPERTIES, Terminal_Punctuation); |
| UnicodeMap names = new UnicodeMap(); |
| names.add("Pd", getSet(CATEGORY, Pd)); |
| names.add("Ps", getSet(CATEGORY, Ps)); |
| names.add("Pe", getSet(CATEGORY, Pe)); |
| names.add("Pc", getSet(CATEGORY, Pc)); |
| names.add("Po", getSet(CATEGORY, Po)); |
| names.add("Pi", getSet(CATEGORY, Pi)); |
| names.add("Pf", getSet(CATEGORY, Pf)); |
| |
| Utility.showSetDifferences(log, "Term", Term, "Terminal_Punctuation", terminal_punctuation, true, true, names, ucd); |
| Utility.showSetDifferences(log, "Po", getSet(CATEGORY, Po), "Terminal_Punctuation", terminal_punctuation, true, true, names, ucd); |
| log.close(); |
| |
| if (true) return; |
| |
| UnicodeSet whitespace = getSet(BINARY_PROPERTIES, White_space); |
| UnicodeSet space = getSet(CATEGORY, Zs).addAll(getSet(CATEGORY, Zp)).addAll(getSet(CATEGORY, Zl)); |
| Utility.showSetDifferences("White_Space", whitespace, "Z", space, true, ucd); |
| |
| UnicodeSet isSpace = new UnicodeSet(); |
| UnicodeSet isSpaceChar = new UnicodeSet(); |
| UnicodeSet isWhitespace = new UnicodeSet(); |
| for (int i = 0; i <= 0xFFFF; ++i) { |
| if (Character.isSpace((char)i)) isSpace.add(i); |
| if (Character.isSpaceChar((char)i)) isSpaceChar.add(i); |
| if (Character.isWhitespace((char)i)) isWhitespace.add(i); |
| } |
| Utility.showSetDifferences("White_Space", whitespace, "isSpace", isSpace, true, ucd); |
| Utility.showSetDifferences("White_Space", whitespace, "isSpaceChar", isSpaceChar, true, ucd); |
| Utility.showSetDifferences("White_Space", whitespace, "isWhitespace", isWhitespace, true, ucd); |
| return; |
| } |
| |
| if (DEBUG) { |
| checkDecomps(); |
| |
| Utility.showSetNames("", new UnicodeSet("[\u034F\u00AD\u1806[:DI:]-[:Cs:]-[:Cn:]]"), true, ucd); |
| |
| System.out.println("*** Extend - Cf"); |
| |
| generateTerminalClosure(); |
| |
| GenerateWordBreakTest gwb = new GenerateWordBreakTest(); |
| PrintWriter systemPrintWriter = new PrintWriter(System.out); |
| gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false); |
| systemPrintWriter.flush(); |
| //showSet("sepSet", GenerateSentenceBreakTest.sepSet); |
| //showSet("atermSet", GenerateSentenceBreakTest.atermSet); |
| //showSet("termSet", GenerateSentenceBreakTest.termSet); |
| } |
| |
| if (true) { |
| GenerateBreakTest foo = new GenerateLineBreakTest(); |
| //foo.isBreak("(\"Go.\") (He did)", 5, true); |
| foo.isBreak("\u4e00\u4300", 1, true); |
| /* |
| GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest(); |
| //foo.isBreak("(\"Go.\") (He did)", 5, true); |
| foo.isBreak("3.4", 2, true); |
| * / |
| } |
| |
| new GenerateGraphemeBreakTest().run(); |
| new GenerateWordBreakTest().run(); |
| new GenerateLineBreakTest().run(); |
| new GenerateSentenceBreakTest().run(); |
| |
| //if (true) return; // cut short for now |
| |
| } |
| |
| */ |
| } |
| |