| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $ |
| * $Date: 2004/04/17 18:21:39 $ |
| * $Revision: 1.5 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.text.UCD; |
| |
| import java.util.*; |
| import java.io.*; |
| |
| import com.ibm.text.utility.*; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| public class GenerateLineBreakTest implements UCD_Types { |
| |
| // COMMON STUFF for Hangul |
| static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5; |
| static final String[] hNames = {"L", "V", "T", "LV", "LVT"}; |
| |
| static byte getHangulType(int cp) { |
| if (Default.ucd().isLeadingJamo(cp)) return hL; |
| if (Default.ucd().isVowelJamo(cp)) return hV; |
| if (Default.ucd().isTrailingJamo(cp)) return hT; |
| if (Default.ucd().isHangulSyllable(cp)) { |
| if (Default.ucd().isDoubleHangul(cp)) return hLV; |
| return hLVT; |
| } |
| return hNot; |
| } |
| |
| //============================ |
| |
| protected String rule; |
| protected String fileName = "Line"; |
| |
| // all the other items are supplied in UCD_TYPES |
| static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT, |
| LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT, |
| LB2_LIMIT = (byte)(LB_SUP + 1); |
| |
| String[] samples = new String[100]; |
| |
| |
| byte[] TypeOrder = { |
| LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO, |
| LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM, |
| // missing from Pair Table |
| LB_SP, LB_BK, LB_CR, LB_LF, |
| // resolved types below |
| LB_CB, LB_AI, LB_SA, LB_SG, LB_XX, |
| // 3 JAMO CLASSES, plus supplementary |
| LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP |
| }; |
| |
| public static void main(String[] args) throws IOException { |
| |
| new GenerateLineBreakTest().run(); |
| |
| new GenerateWordBreakTest().run(); |
| } |
| |
| // stuff that subclasses need to override |
| public void run() throws IOException { |
| findSamples(); |
| |
| // test individual cases |
| //printLine(out, samples[LB_ZW], "", samples[LB_CL]); |
| //printLine(out, samples[LB_ZW], " ", samples[LB_CL]); |
| |
| PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS); |
| out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" |
| + fileName + "</title></head>"); |
| out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>"); |
| |
| |
| |
| generateTable(out, false); |
| out.println("<h3>Recommended:</h3>"); |
| generateTable(out, true); |
| out.println("</body></html>"); |
| out.close(); |
| |
| String[] testCase = new String[50]; |
| // do main test |
| |
| for (int k = 0; k < 2; ++k) { |
| out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS); |
| int counter = 0; |
| |
| out.println("# Default " + fileName + " Break Test"); |
| out.println("# Generated: " + Default.getDate() + ", MED"); |
| out.println("#"); |
| out.println("# Format:"); |
| out.println("# <string> (# <comment>)? "); |
| out.println("# <string> contains hex Unicode code points, with "); |
| out.println("#\t" + BREAK + " wherever there is a break opportunity, and "); |
| out.println("#\t" + NOBREAK + " wherever there is not."); |
| out.println("# <comment> the format can change, but currently it shows:"); |
| out.println("#\t- the sample character name"); |
| out.println("#\t- (x) the line_break property* for the sample character"); |
| out.println("#\t- [x] the rule that determines whether there is a break or not"); |
| out.println("#"); |
| out.println("# Samples:"); |
| out.println("# The test currently takes all pairs of linebreak types*,"); |
| out.println("# picks a sample for each type, and generates three strings: "); |
| out.println("#\t- the pair alone"); |
| out.println("#\t- the pair alone with an imbeded space"); |
| out.println("#\t- the pair alone with embedded combining marks"); |
| out.println("# The sample for each type is simply the first code point (above NULL)"); |
| out.println("# with that property."); |
| out.println("# * Note:"); |
| out.println("#\t- SG is omitted"); |
| out.println("#\t- 3 different Jamo characters and a supplementary character are added"); |
| out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments"); |
| out.println("#\t instead of the linebreak property"); |
| out.println("# These samples may be extended in the future."); |
| out.println("#"); |
| |
| for (int ii = 0; ii < getLimit(); ++ii) { |
| int i = TypeOrder[ii]; |
| if (i == LB_SG) continue; |
| String before = samples[i]; |
| |
| for (int jj = 0; jj < getLimit(); ++jj) { |
| Utility.dot(counter); |
| int j = TypeOrder[jj]; |
| if (j == LB_SG) continue; |
| String after = samples[j]; |
| // do line straight |
| int len = genTestItems(before, after, testCase); |
| for (int q = 0; q < len; ++q) { |
| printLine(out, testCase[q], k != 0 && q == 0, false); |
| ++counter; |
| } |
| } |
| } |
| out.println("# Lines: " + counter); |
| out.close(); |
| } |
| } |
| |
| // stuff that subclasses need to override |
| public int genTestItems(String before, String after, String[] results) { |
| results[0] = before + after; |
| results[1] = before + " " + after; |
| results[2] = before + "\u0301\u0308" + after; |
| return 3; |
| } |
| |
| // stuff that subclasses need to override |
| boolean skipType(byte type) { |
| return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX; |
| } |
| |
| // stuff that subclasses need to override |
| public String getTypeID(int cp) { |
| byte result = getType(cp); |
| if (result == LB_SUP) return "SUP"; |
| if (result >= LB_LIMIT) return hNames[result - LB_LIMIT]; |
| return Default.ucd().getLineBreakID_fromIndex(result); |
| } |
| |
| // stuff that subclasses need to override |
| public byte getType(int cp) { |
| if (cp > 0xFFFF) return LB_SUP; |
| byte result = getHangulType(cp); |
| if (result != hNot) return (byte)(result + LB_LIMIT); |
| return Default.ucd().getLineBreak(cp); |
| } |
| |
| public int getLimit() { |
| return LB2_LIMIT; |
| } |
| |
| public int getTableLimit() { |
| return LB_SUP; // skip last; |
| } |
| |
| |
| public void generateTable(PrintWriter out, boolean recommended) { |
| String width = "width='" + (100 / (getTableLimit() + 1)) + "%'"; |
| out.print("<table border='1' cellspacing='0'><tr><th " + width + "></th>"); |
| byte type; |
| for (int i = 0; i < getTableLimit(); ++i) { |
| type = TypeOrder[i]; |
| if (skipType(type)) continue; |
| |
| String h = getTypeID(samples[TypeOrder[i]]); |
| out.print("<th " + width + ">" + h + "</th>"); |
| } |
| out.print("</tr>"); |
| String[] rule = new String[1]; |
| String[] rule2 = new String[1]; |
| for (int i = 0; i < getTableLimit(); ++i) { |
| type = TypeOrder[i]; |
| if (skipType(type)) continue; |
| |
| String before = samples[type]; |
| String line = "<tr><th>" + getTypeID(before) + "</th>"; |
| for (int j = 0; j < getTableLimit(); ++j) { |
| type = TypeOrder[j]; |
| if (skipType(type)) continue; |
| |
| String after = samples[type]; |
| String t = getTableEntry(before, after, recommended, rule); |
| String background = ""; |
| String t2 = getTableEntry(before, after, !recommended, rule2); |
| if (!t.equals(t2)) { |
| if (t.equals(NOBREAK)) { |
| background = " bgcolor='#CCFFFF'"; |
| } else { |
| background = " bgcolor='#FFFF00'"; |
| } |
| } else if (t.equals(NOBREAK)) { |
| background = " bgcolor='#CCCCFF'"; |
| } |
| line += "<th title='" + rule[0] + "'" + background + ">" + t + "</th>"; |
| } |
| out.println(line + "</tr>"); |
| } |
| out.println("</table>"); |
| } |
| |
| public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) { |
| String t = "_"; |
| boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended); |
| String spaceRule = rule; |
| |
| boolean spaceBreak2 = isBreak(before + " " + after, before.length(), recommended); |
| String spaceRule2 = rule; |
| |
| boolean normalBreak = isBreak(before + after, before.length(), recommended); |
| String normalRule = rule; |
| |
| if (!normalBreak) { |
| if (!spaceBreak && !spaceBreak2) { |
| t = "^"; |
| rule = spaceRule.equals(normalRule) ? normalRule : spaceRule + "/" + normalRule; |
| if (!spaceRule2.equals(normalRule) && !spaceRule2.equals(spaceRule)) { |
| rule += "/" + spaceRule2; |
| } |
| } else { |
| t = "%"; |
| rule = normalRule; |
| } |
| } |
| ruleOut[0] = rule; |
| return t; |
| } |
| |
| static final String BREAK = "\u00F7"; |
| static final String NOBREAK = "\u00D7"; |
| |
| public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) { |
| int cp; |
| StringBuffer string = new StringBuffer(); |
| StringBuffer comment = new StringBuffer("\t# "); |
| String status = isBreak(source, 0, recommended) ? BREAK : NOBREAK; |
| string.append(status); |
| comment.append(' ').append(status).append(" [").append(rule).append(']'); |
| |
| for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) { |
| |
| cp = UTF16.charAt(source, offset); |
| if (string.length() > 0) { |
| string.append(' '); |
| comment.append(' '); |
| } |
| |
| string.append(Utility.hex(cp)); |
| comment.append(Default.ucd().getName(cp) + " (" + getTypeID(cp) + ")"); |
| |
| status = isBreak(source, offset + UTF16.getCharCount(cp), recommended) ? BREAK : NOBREAK; |
| string.append(' ').append(status); |
| comment.append(' ').append(status).append(" [").append(rule).append(']'); |
| } |
| |
| if (comments) string.append(comment); |
| out.println(string); |
| } |
| |
| public void findSamples() { |
| for (int i = 1; i <= 0x10FFFF; ++i) { |
| if (!Default.ucd().isAllocated(i)) continue; |
| if (0xD800 <= i && i <= 0xDFFF) continue; |
| if(i == 0x1100) { |
| System.out.print("here"); |
| } |
| byte lb = getType(i); |
| if (samples[lb] == null) { |
| samples[lb] = UTF16.valueOf(i); |
| } |
| } |
| for (int i = 0; i < TypeOrder.length; ++i) { |
| String sample = samples[i]; |
| System.out.println(getTypeID(sample) + ":\t" + Default.ucd().getCodeAndName(sample)); |
| } |
| } |
| |
| |
| public String getTypeID(String s) { |
| if (s == null) return "<null>"; |
| if (s.length() == 1) return getTypeID(s.charAt(0)); |
| StringBuffer result = new StringBuffer(); |
| int cp; |
| for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { |
| cp = UTF32.char32At(s, i); |
| if (i > 0) result.append(" "); |
| result.append(getTypeID(cp)); |
| } |
| return result.toString(); |
| } |
| |
| public int findLastNon(String source, int offset, byte notLBType, boolean recommended) { |
| int cp; |
| for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(source, i); |
| byte f = getResolvedType(cp, recommended); |
| if (f != notLBType) return i; |
| } |
| return -1; |
| } |
| |
| public byte getResolvedType (int cp, boolean recommended) { |
| // LB 1 Assign a line break category to each character of the input. |
| // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. |
| byte result = getType(cp); |
| switch (result) { |
| case LB_AI: result = LB_AI; break; |
| // case LB_CB: result = LB_ID; break; |
| case LB_SA: result = LB_AL; break; |
| // case LB_SG: result = LB_XX; break; Surrogates; will never occur |
| case LB_XX: result = LB_AL; break; |
| } |
| if (recommended) { |
| if (getHangulType(cp) != hNot) { |
| result = LB_ID; |
| } |
| } |
| |
| return result; |
| } |
| |
| public boolean onCodepointBoundary(String s, int offset) { |
| if (offset < 0 || offset > s.length()) return false; |
| if (offset == 0 || offset == s.length()) return true; |
| if (UTF16.isLeadSurrogate(s.charAt(offset-1)) |
| && UTF16.isTrailSurrogate(s.charAt(offset))) return false; |
| return true; |
| } |
| |
| // find out whether there is a break at offset |
| // WARNING: as a side effect, sets "rule" |
| |
| public boolean isBreak(String source, int offset, boolean recommended) { |
| |
| // LB 1 Assign a line break category to each character of the input. |
| // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. |
| // this is taken care of in the getResolvedType function |
| |
| // LB 2a Never break at the start of text |
| |
| rule="2a"; |
| if (offset <= 0) return false; |
| |
| // LB 2b Always break at the end of text |
| |
| rule="2b"; |
| if (offset >= source.length()) return true; |
| |
| |
| // UTF-16: never break in the middle of a code point |
| if (!onCodepointBoundary(source, offset)) return false; |
| |
| |
| // now get the character before and after, and their types |
| |
| |
| int cpBefore = UTF16.charAt(source, offset-1); |
| int cpAfter = UTF16.charAt(source, offset); |
| |
| byte before = getResolvedType(cpBefore, recommended); |
| byte after = getResolvedType(cpAfter, recommended); |
| |
| |
| rule="3a"; |
| // Always break after hard line breaks (but never between CR and LF). |
| // CR ^ LF |
| if (before == LB_CR && after == LB_LF) return false; |
| if (before == LB_BK || before == LB_LF || before == LB_CR) return true; |
| |
| //LB 3b Don’t break before hard line breaks. |
| rule="3b"; |
| if (after == LB_BK || after == LB_LF | after == LB_CR) return false; |
| |
| // LB 4 Don’t break before spaces or zero-width space. |
| // × SP |
| // × ZW |
| |
| rule="4"; |
| if (after == LB_SP || after == LB_ZW) return false; |
| |
| // LB 5 Break after zero-width space. |
| // ZW ÷ |
| rule="5"; |
| if (before == LB_ZW) return true; |
| |
| // LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. |
| rule="6"; |
| if (after == LB_CM) return false; |
| |
| if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false; |
| |
| if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false; |
| |
| if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false; |
| |
| boolean setBase = false; |
| if (before == LB_CM) { |
| setBase = true; |
| int backOffset = findLastNon(source, offset, LB_CM, recommended); |
| if (backOffset < 0) { |
| before = LB_ID; |
| } else { |
| before = getResolvedType(UTF16.charAt(source, backOffset), recommended); |
| } |
| } |
| |
| // LB 7 In all of the following rules, if a space is the base character for a combining mark, |
| // the space is changed to type ID. In other words, break before SP CM* in the same cases as |
| // one would break before an ID. |
| rule="7"; |
| if (setBase && before == LB_SP) before = LB_ID; |
| |
| // LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. |
| // × CL, × EX, × IS, × SY |
| rule="8"; |
| if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false; |
| |
| |
| // find the last non-space character; we will need it |
| byte lastNonSpace = before; |
| if (lastNonSpace == LB_SP) { |
| int backOffset = findLastNon(source, offset, LB_CM, recommended); |
| if (backOffset >= 0) { |
| lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended); |
| } |
| } |
| |
| // LB 9 Don’t break after ‘[’, even after spaces. |
| // OP SP* × |
| rule="9"; |
| if (lastNonSpace == LB_OP) return false; |
| |
| // LB 10 Don’t break within ‘”[’, , even with intervening spaces. |
| // QU SP* × OP |
| rule="10"; |
| if (lastNonSpace == LB_QU && after == LB_OP) return false; |
| |
| // LB 11 Don’t break within ‘]h’, even with intervening spaces. |
| // CL SP* × NS |
| rule="11"; |
| if (lastNonSpace == LB_CL && after == LB_NS) return false; |
| |
| // LB 11a Don’t break within ‘——’, even with intervening spaces. |
| // B2 × B2 |
| rule="11a"; |
| if (lastNonSpace == LB_B2 && after == LB_B2) return false; |
| |
| |
| if (recommended) { |
| // LB 13 Don’t break before or after NBSP or WORD JOINER |
| // × GL |
| // GL × |
| |
| rule="11b"; |
| if (after == LB_GL || before == LB_GL) return false; |
| } |
| |
| // [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.] |
| |
| rule="12"; |
| // LB 12 Break after spaces |
| // SP ÷ |
| |
| if (before == LB_SP) return true; |
| |
| if (!recommended) { |
| // LB 13 Don’t break before or after NBSP or WORD JOINER |
| // × GL |
| // GL × |
| |
| rule="13"; |
| if (after == LB_GL || before == LB_GL) return false; |
| } |
| |
| rule="14"; |
| // LB 14 Don’t break before or after ‘”’ |
| // × QU |
| // QU × |
| if (before == LB_QU || after == LB_QU) return false; |
| |
| // LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces, |
| // small kana and other non- starters, or after acute accents: |
| // × BA |
| // × HY |
| // × NS |
| // BB × |
| |
| if (recommended) { |
| // LB 14a Break before and after CB |
| // CB ÷ |
| // ÷ CB |
| if (before == LB_CB || after == LB_CB) return true; |
| |
| } |
| |
| rule="15"; |
| if (after == LB_NS) return false; |
| if (after == LB_HY) return false; |
| if (after == LB_BA) return false; |
| if (before == LB_BB) return false; |
| |
| if (!recommended) { |
| // LB 15b Break after hyphen-minus, and before acute accents: |
| // HY ÷ |
| // ÷ BB |
| |
| rule="15b"; |
| if (before == LB_HY) return true; |
| if (after == LB_BB) return true; |
| } |
| |
| // LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis: |
| // AL × IN |
| // ID × IN |
| // IN × IN |
| // NU × IN |
| // Examples: ’9...’, ‘a...’, ‘H...’ |
| rule="16"; |
| if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false; |
| if (before == LB_IN && after == LB_IN) return false; |
| |
| // Don't break alphanumerics. |
| // LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’ |
| // ID × PO |
| // AL × NU |
| // NU × AL |
| // Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ? |
| // Examples: $(12.35) 2,1234 (12)¢ 12.54¢ |
| // This is approximated with the following rules. (Some cases already handled above, |
| // like ‘9,’, ‘[9’.) |
| rule="17"; |
| if (before == LB_ID && after == LB_PO) return false; |
| if (before == LB_AL && after == LB_NU) return false; |
| if (before == LB_NU && after == LB_AL) return false; |
| |
| // LB 18 Don’t break between the following pairs of classes. |
| // CL × PO |
| // HY × NU |
| // IS × NU |
| // NU × NU |
| // NU × PO |
| // PR × AL |
| // PR × HY |
| // PR × ID |
| // PR × NU |
| // PR × OP |
| // SY × NU |
| // Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’ |
| |
| rule="18"; |
| if (before == LB_CL && after == LB_PO) return false; |
| if (before == LB_HY && after == LB_NU) return false; |
| if (before == LB_IS && after == LB_NU) return false; |
| if (before == LB_NU && after == LB_NU) return false; |
| if (before == LB_NU && after == LB_PO) return false; |
| |
| if (before == LB_PR && after == LB_AL) return false; |
| if (before == LB_PR && after == LB_HY) return false; |
| if (before == LB_PR && after == LB_ID) return false; |
| if (before == LB_PR && after == LB_NU) return false; |
| if (before == LB_PR && after == LB_OP) return false; |
| |
| if (before == LB_SY && after == LB_NU) return false; |
| |
| if (recommended) { |
| // LB 15b Break after hyphen-minus, and before acute accents: |
| // HY ÷ |
| // ÷ BB |
| |
| rule="18b"; |
| if (before == LB_HY) return true; |
| if (after == LB_BB) return true; |
| } |
| |
| // LB 19 Don’t break between alphabetics (“at”) |
| // AL × AL |
| |
| rule="19"; |
| if (before == LB_AL && after == LB_AL) return false; |
| |
| // LB 20 Break everywhere else |
| // ALL ÷ |
| // ÷ ALL |
| |
| rule="20"; |
| return true; |
| } |
| |
| static class GenerateWordBreakTest extends GenerateLineBreakTest { |
| |
| static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8, |
| oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES! |
| L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT, |
| LIMIT = LVT + 1; |
| |
| static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" }; |
| |
| static UCDProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend); |
| static UCDProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase); |
| static UCDProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink); |
| |
| { |
| fileName = "Word"; |
| TypeOrder = new byte[LIMIT]; |
| for (byte i = 0; i < TypeOrder.length; ++i) { |
| TypeOrder[i] = i; |
| } |
| } |
| |
| boolean skipType(byte type) { |
| return false; |
| } |
| |
| public int getLimit() { |
| return LIMIT; |
| } |
| |
| public int getTableLimit() { |
| return LIMIT; |
| } |
| |
| // stuff that subclasses need to override |
| public int genTestItems(String before, String after, String[] results) { |
| results[0] = before + after; |
| return 1; |
| } |
| |
| public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) { |
| boolean normalBreak = isBreak(before + after, before.length(), recommended); |
| String normalRule = rule; |
| ruleOut[0] = rule; |
| return normalBreak ? BREAK : NOBREAK; |
| } |
| |
| // stuff that subclasses need to override |
| public String getTypeID(int cp) { |
| byte type = getType(cp); |
| if (type >= oLIMIT) return hNames[type - oLIMIT]; |
| return Names[type]; |
| } |
| |
| // stuff that subclasses need to override |
| public byte getType(int cp) { |
| // single characters |
| if (cp == 0xA) return LF; |
| if (cp == 0xD) return CR; |
| if (cp == 0x034F) return CGJ; |
| if (cp == 0x2028 || cp == 0x2029) return Control; |
| |
| // Hangul |
| byte result = getHangulType(cp); |
| if (result != hNot) return (byte)(result + oLIMIT); |
| |
| // other properties |
| // category based |
| byte cat = Default.ucd().getCategory(cp); |
| if (cat == Cc) return Control; |
| if (cat == Cf) return Extend; |
| if (((1<<cat) & LETTER_MASK) != 0) return LetterBase; |
| |
| // other binary properties |
| if (linkProp.hasValue(cp)) return Link; |
| if (extendProp.hasValue(cp)) return Extend; |
| if (baseProp.hasValue(cp)) return Base; |
| |
| return Other; |
| } |
| |
| public byte getResolvedType(int cp, boolean recommended) { |
| return getType(cp); |
| } |
| |
| public boolean isBreak(String source, int offset, boolean recommended) { |
| rule="1"; |
| if (offset < 0 || offset > source.length()) return false; |
| if (offset == 0) return true; |
| |
| rule = "2"; |
| if (offset == source.length()) return true; |
| |
| // UTF-16: never break in the middle of a code point |
| if (!onCodepointBoundary(source, offset)) return false; |
| |
| // now get the character before and after, and their types |
| |
| |
| int cpBefore = UTF16.charAt(source, offset-1); |
| int cpAfter = UTF16.charAt(source, offset); |
| |
| byte before = getResolvedType(cpBefore, recommended); |
| byte after = getResolvedType(cpAfter, recommended); |
| |
| rule = "3"; |
| if (before == CR && after == LF) return false; |
| |
| rule = "4"; |
| if (before == CR || before == LF || before == Control |
| || after == Control || after == LF || after == CR) return true; |
| |
| rule = "6"; |
| if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; |
| |
| rule = "7"; |
| if ((before == LV || before == V) && (after == V || after == T)) return false; |
| |
| rule = "8"; |
| if ((before == LVT || before == T) && (after == T)) return false; |
| |
| rule = "9"; |
| if (after == Extend) return false; |
| |
| if (recommended) { |
| if (after == Link || after == CGJ) return false; |
| } else { |
| |
| // Do not break around a CGJ. |
| rule = "10"; |
| if (before == CGJ && (after == Base |
| || after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false; |
| rule = "11"; |
| if (after == CGJ) return false; |
| |
| // Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together. |
| |
| rule = "12"; |
| //Link Extend* × LetterBase (12) |
| if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) { |
| int backOffset = findLastNon(source, offset, Extend, recommended); |
| if (backOffset >= 0) { |
| byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended); |
| if (last == Link) return false; |
| } |
| } |
| |
| rule = "13"; |
| if (after == Link) return false; |
| } |
| |
| // Otherwise break after all characters. |
| rule = "14"; |
| return true; |
| |
| } |
| |
| } |
| } |