| package com.ibm.text.UCD; |
| import com.ibm.icu.impl.CollectionUtilities; |
| import com.ibm.icu.text.Collator; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.util.ULocale; |
| |
| import java.util.BitSet; |
| import com.ibm.text.utility.*; |
| import java.io.PrintWriter; |
| |
| |
| public final class NFSkippable extends UCDProperty { |
| |
/** When true, {@code hasValue} records in {@code cause} why it reached its verdict. */
static final boolean DEBUG = false;

/** Initial capacity of {@code realTrailers}; the array is grown on demand. */
private static final int INITIAL_TRAILER_CAPACITY = 100;

/** Normalizer for the mode passed to the constructor. */
private final Normalizer nf;
/** NFD normalizer built for the same UCD version as {@code nf}. */
private final Normalizer nfd;
/** Character database supplied by the caller. */
private final UCD ucd;
/** True when the mode is NFC or NFKC. */
private final boolean composes;
/** Code points for which {@code nf.isTrailing} holds, excluding non-lead Jamo. */
private int[] realTrailers = new int[INITIAL_TRAILER_CAPACITY];
/** Number of valid entries at the front of {@code realTrailers}. */
private int realTrailerCount = 0;

/**
 * Builds the skippable property for one normalization form.
 *
 * For composing forms (NFC, NFKC) every code point is scanned once and the
 * trailing characters that are not non-lead Jamo are remembered, so that
 * {@code hasValue} can later probe each candidate against them.
 *
 * @param normalizerMode one of the {@code Normalizer} mode constants
 * @param inputUCD character database; its version selects the normalizer data
 */
public NFSkippable(byte normalizerMode, UCD inputUCD) {
    isStandard = false;
    this.ucd = inputUCD;
    nf = new Normalizer(normalizerMode, ucd.getVersion());
    name = nf.getName() + "_Skippable";
    shortName = nf.getName() + "_Skip";
    header = "# Derived Property: " + name
        + "\r\n# Generated according to UAX #15."
        + "\r\n# Characters that don't interact with any others in this normalization form."
        + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
        + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";

    nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
    composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC;

    // preprocess to find possible trailers (only composing forms need them)
    if (composes) {
        for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {
            if (!nf.isTrailing(cp2)) {
                continue;
            }
            // non-lead Jamo are deliberately left out of the probe list
            if (ucd.isNonLeadJamo(cp2)) {
                continue;
            }
            // Grow instead of overflowing: the number of trailers depends on the
            // Unicode version and is not guaranteed to stay below the initial size.
            if (realTrailerCount == realTrailers.length) {
                int[] grown = new int[realTrailers.length * 2];
                System.arraycopy(realTrailers, 0, grown, 0, realTrailerCount);
                realTrailers = grown;
            }
            realTrailers[realTrailerCount++] = cp2;
        }
    }
    Utility.fixDot();
}
| |
| /** A skippable character is<br> |
| * a) unassigned, or ALL of the following:<br> |
| * b) of combining class 0.<br> |
| * c) not decomposed by this normalization form.<br> |
| * AND if NKC or NFKC, <br> |
| * d) can never compose with a previous character.<br> |
| * e) can never compose with a following character.<br> |
| * f) can never change if another character is added. |
| * Example: a-breve might satisfy all but f, but if you |
| * add an ogonek it changes to a-ogonek + breve |
| */ |
| |
| String cause = ""; |
| |
| public boolean hasValue(int cp) { |
| // quick check on some special classes |
| if (DEBUG) cause = "\t\tunassigned"; |
| if (!ucd.isAssigned(cp)) return true; |
| |
| if (DEBUG) cause = "\t\tnf differs"; |
| if (!nf.isNormalized(cp)) return false; |
| |
| if (DEBUG) cause = "\t\tnon-zero cc"; |
| if (ucd.getCombiningClass(cp) != 0) return false; |
| |
| if (DEBUG) cause = ""; |
| if (!composes) return true; |
| |
| // now special checks for composing normalizers |
| if (DEBUG) cause = "\t\tleading"; |
| if (nf.isLeading(cp)) return false; |
| |
| if (DEBUG) cause = "\t\ttrailing"; |
| if (nf.isTrailing(cp)) return false; |
| |
| // OPTIMIZATION -- careful |
| // If there is no NFD decomposition, then this character's accents can't be |
| // "displaced", so we don't have to test further |
| |
| if (DEBUG) cause = "\t\tno decomp"; |
| if (nfd.isNormalized(cp)) return true; |
| |
| // OPTIMIZATION -- careful |
| // Hangul syllables are skippable IFF they are isLeadingJamoComposition |
| if (ucd.isHangulSyllable(cp)) return !ucd.isLeadingJamoComposition(cp); |
| |
| // We now see if adding another character causes a problem. |
| // brute force for now!! |
| // We do skip the trailing Jamo, since those never displace! |
| |
| StringBuffer base = new StringBuffer(UTF16.valueOf(cp)); |
| int baseLen = base.length(); |
| for (int i = 0; i < realTrailerCount; ++i) { |
| base.setLength(baseLen); // shorten if needed |
| base.append(UTF16.valueOf(realTrailers[i])); |
| String probe = base.toString(); |
| String result = nf.normalize(probe); |
| if (!result.equals(probe)) { |
| if (DEBUG) cause = "\t\tinteracts with " + ucd.getCodeAndName(realTrailers[i]); |
| return false; |
| } |
| } |
| |
| // passed the sieve, so we are ok |
| if (DEBUG) cause = ""; |
| return true; |
| } |
| |
| // both the following should go into UTF16 |
| |
| public static String replace(String source, int toReplace, int replacement) { |
| if (0 <= toReplace && toReplace <= 0xFFFF |
| && 0 <= replacement && replacement <= 0xFFFF) { |
| return source.replace((char)toReplace, (char)replacement); |
| } |
| return replace(source, UTF16.valueOf(toReplace), UTF16.valueOf(replacement)); |
| } |
| |
| public static String replace(String source, String toReplace, String replacement) { |
| int pos = 0; |
| StringBuffer result = new StringBuffer(source.length()); |
| while (true) { |
| int newPos = source.indexOf(toReplace, pos); |
| if (newPos >= 0) { |
| result.append(source.substring(pos, newPos)); |
| result.append(replacement); |
| pos = newPos + toReplace.length(); |
| } else if (pos != 0) { |
| result.append(source.substring(pos)); |
| return result.toString(); |
| } else { |
| return source; // no change necessary |
| } |
| } |
| } |
| |
| static void writeStringInPieces(PrintWriter pw, String s, String term) { |
| int start; |
| int end; |
| int lineLen = 64; |
| for (start = 0; ; start = end) { |
| if (start == 0) pw.print("\t \""); |
| else pw.print("\t+ \""); |
| end = s.length(); |
| if (end > start + lineLen) end = start + lineLen; |
| |
| // if we have a slash in the last 5 characters, backup |
| |
| int lastSlash = s.lastIndexOf('\\', end); |
| if (lastSlash >= end-5) end = lastSlash; |
| |
| // backup if we broke on a \ |
| |
| while (end > start && s.charAt(end-1) == '\\') --end; |
| |
| pw.print(s.substring(start, end)); |
| if (end == s.length()) { |
| pw.println('"' + term); |
| break; |
| } else { |
| pw.println('"'); |
| } |
| } |
| } |
| |
| static void testWriteStringInPieces() { |
| String test = |
| "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD" |
| + "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD\\u00F" |
| + "F-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137\\u0139-" |
| + "\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165\\u0168-\\u017" |
| + "E\\u01A0-\\u01A1\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u" |
| + "01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F\\u0226"; |
| PrintWriter pw = new PrintWriter(System.out); |
| writeStringInPieces(pw,test,""); |
| writeStringInPieces(pw,replace(test, "\\", "\\\\"),""); |
| |
| pw.flush(); |
| } |
| |
| static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller |
| |
| public static void main (String[] args) throws java.io.IOException { |
| |
| |
| PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS); |
| out.println(Utility.BOM); |
| out.println("NFSafeSets"); |
| out.println("Version: " + Default.ucd().getVersion()); |
| out.println("Date: " + Default.getDate()); |
| out.println(); |
| |
| for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) { |
| UCDProperty up = DerivedProperty.make(mode, Default.ucd()); |
| generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up); |
| } |
| |
| for (byte mode = NFD; mode <= NFKC; ++mode) { |
| NFSkippable skipper = new NFSkippable(mode, Default.ucd()); |
| generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper); |
| } |
| System.out.println("Done"); |
| out.close(); |
| } |
| |
| static Collator UCA = Collator.getInstance(ULocale.ROOT); |
| |
| static void generateSet(PrintWriter out, String label, UCDProperty up) { |
| System.out.println("Generating: " + up.getName(NORMAL)); |
| UnicodeSet result = new UnicodeSet(); |
| for (int cp = 0; cp <= limit; ++cp) { |
| Utility.dot(cp); |
| if (up.hasValue(cp)) result.add(cp); |
| } |
| Utility.fixDot(); |
| |
| String rSet = result.toPattern(true); |
| rSet = replace(rSet, "\\U", "\\\\U"); |
| rSet = replace(rSet, "\\u", "\\\\u"); |
| out.println(label + " = new UnicodeSet("); |
| writeStringInPieces(out, rSet, ", false);"); |
| |
| if (true) { |
| rSet = result.toPattern(false); |
| } else { |
| rSet = CollectionUtilities.prettyPrint(result, true, null, null, UCA, UCA); |
| } |
| |
| out.println("/*Unicode: "); |
| writeStringInPieces(out, rSet, "*/"); |
| out.println(); |
| out.flush(); |
| System.out.println("Done"); |
| } |
| |
| /* |
| // DerivedProperty dp = new DerivedProperty(UCD.make(version)); |
| |
| System.out.println(skipper.getName(NORMAL)); |
| |
| UnicodeSet result = new UnicodeSet(); |
| for (int cp = 0; cp <= limit; ++cp) { |
| Utility.dot(cp); |
| if (skipper.hasProperty(cp)) result.add(cp); |
| } |
| Utility.fixDot(); |
| |
| String rSet = result.toPattern(true); |
| rSet = replace(rSet, "\\U", "\\\\U"); |
| out.println("\tSKIPPABLE[" + skipper.getName(NORMAL) |
| + "] = new UnicodeSet("); |
| writeStringInPieces(out, rSet, ", false);"); |
| out.println(); |
| |
| rSet = result.toPattern(false); |
| out.println("/*Unicode: "); |
| */ |
| //writeStringInPieces(out, rSet, "*/"); |
| /*out.println(); |
| out.flush(); |
| |
| if (false) { |
| NFSkippable skipper = new NFSkippable(Normalizer.NFC,""); |
| NFSkippable skipper2 = new NFSkippable(Normalizer.NFKC,""); |
| for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
| if (cp > 0xFF) { |
| if (!skipper.ucd.isAssigned(cp)) continue; |
| byte cat = skipper.ucd.getCategory(cp); |
| if (cat == PRIVATE_USE || cat == SURROGATE) continue; |
| if (skipper.ucd.getCombiningClass(cp) != 0) continue; |
| if (!skipper.nf.isNormalized(cp)) continue; |
| if ((cp < 0xAC00 || cp > 0xAE00) |
| && cp != skipper.ucd.mapToRepresentative(cp, false)) continue; |
| } |
| |
| if (skipper2.hasProperty(cp) == skipper.hasProperty(cp)) continue; |
| |
| String status = (skipper.hasProperty(cp) ? " SKIPc " : "NOSKIPc ") |
| + (skipper2.hasProperty(cp) ? " SKIPkc " : "NOSKIPkc "); |
| System.out.println(status |
| + skipper.ucd.getCodeAndName(cp) |
| + skipper.cause); |
| } |
| } |
| */ |
| |
| } |