blob: c9cd0399a402036a042547c1308fc05ffbb8e721 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* TailoredSet.java, ported from collationsets.h/.cpp
*
* C++ version created on: 2013feb09
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import java.util.Iterator;
import com.ibm.icu.impl.Normalizer2Impl.Hangul;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.CharsTrie;
import com.ibm.icu.util.CharsTrie.Entry;
/**
* Finds the set of characters and strings that sort differently in the tailoring
* from the base data.
*
* Every mapping in the tailoring needs to be compared to the base,
* because some mappings are copied for optimization, and
* all contractions for a character are copied if any contractions for that character
* are added, modified or removed.
*
* It might be simpler to re-parse the rule string, but:
* - That would require duplicating some of the from-rules builder code.
* - That would make the runtime code depend on the builder.
* - That would only work if we have the rule string, and we allow users to
* omit the rule string from data files.
*/
public final class TailoredSet {
private CollationData data;
private CollationData baseData;
private UnicodeSet tailored;
private StringBuilder unreversedPrefix = new StringBuilder();
private String suffix;
public TailoredSet(UnicodeSet t) {
tailored = t;
}
public void forData(CollationData d) {
data = d;
baseData = d.base;
assert (baseData != null);
// utrie2_enum(data->trie, NULL, enumTailoredRange, this);
Iterator<Trie2.Range> trieIterator = data.trie.iterator();
Trie2.Range range;
while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this);
}
}
private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) {
if (ce32 == Collation.FALLBACK_CE32) {
return; // fallback to base, not tailored
}
ts.handleCE32(start, end, ce32);
}
// Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
// Also, ICU4C requires handleCE32() to be public because it is used by the callback
// function (enumTailoredRange()). This is not necessary for Java implementation.
private void handleCE32(int start, int end, int ce32) {
assert (ce32 != Collation.FALLBACK_CE32);
if (Collation.isSpecialCE32(ce32)) {
ce32 = data.getIndirectCE32(ce32);
if (ce32 == Collation.FALLBACK_CE32) {
return;
}
}
do {
int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start));
// Do not just continue if ce32 == baseCE32 because
// contractions and expansions in different data objects
// normally differ even if they have the same data offsets.
if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) {
// fastpath
if (ce32 != baseCE32) {
tailored.add(start);
}
} else {
compare(start, ce32, baseCE32);
}
} while (++start <= end);
}
private void compare(int c, int ce32, int baseCE32) {
if (Collation.isPrefixCE32(ce32)) {
int dataIndex = Collation.indexFromCE32(ce32);
ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
if (Collation.isPrefixCE32(baseCE32)) {
int baseIndex = Collation.indexFromCE32(baseCE32);
baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
} else {
addPrefixes(data, c, data.contexts, dataIndex + 2);
}
} else if (Collation.isPrefixCE32(baseCE32)) {
int baseIndex = Collation.indexFromCE32(baseCE32);
baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
addPrefixes(baseData, c, baseData.contexts, baseIndex + 2);
}
if (Collation.isContractionCE32(ce32)) {
int dataIndex = Collation.indexFromCE32(ce32);
if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
ce32 = Collation.NO_CE32;
} else {
ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
}
if (Collation.isContractionCE32(baseCE32)) {
int baseIndex = Collation.indexFromCE32(baseCE32);
if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
baseCE32 = Collation.NO_CE32;
} else {
baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
}
compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
} else {
addContractions(c, data.contexts, dataIndex + 2);
}
} else if (Collation.isContractionCE32(baseCE32)) {
int baseIndex = Collation.indexFromCE32(baseCE32);
baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
addContractions(c, baseData.contexts, baseIndex + 2);
}
int tag;
if (Collation.isSpecialCE32(ce32)) {
tag = Collation.tagFromCE32(ce32);
assert (tag != Collation.PREFIX_TAG);
assert (tag != Collation.CONTRACTION_TAG);
// Currently, the tailoring data builder does not write offset tags.
// They might be useful for saving space,
// but they would complicate the builder,
// and in tailorings we assume that performance of tailored characters is more important.
assert (tag != Collation.OFFSET_TAG);
} else {
tag = -1;
}
int baseTag;
if (Collation.isSpecialCE32(baseCE32)) {
baseTag = Collation.tagFromCE32(baseCE32);
assert (baseTag != Collation.PREFIX_TAG);
assert (baseTag != Collation.CONTRACTION_TAG);
} else {
baseTag = -1;
}
// Non-contextual mappings, expansions, etc.
if (baseTag == Collation.OFFSET_TAG) {
// We might be comparing a tailoring CE which is a copy of
// a base offset-tag CE, via the [optimize [set]] syntax
// or when a single-character mapping was copied for tailored contractions.
// Offset tags always result in long-primary CEs,
// with common secondary/tertiary weights.
if (!Collation.isLongPrimaryCE32(ce32)) {
add(c);
return;
}
long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)];
long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
if (Collation.primaryFromLongPrimaryCE32(ce32) != p) {
add(c);
return;
}
}
if (tag != baseTag) {
add(c);
return;
}
if (tag == Collation.EXPANSION32_TAG) {
int length = Collation.lengthFromCE32(ce32);
int baseLength = Collation.lengthFromCE32(baseCE32);
if (length != baseLength) {
add(c);
return;
}
int idx0 = Collation.indexFromCE32(ce32);
int idx1 = Collation.indexFromCE32(baseCE32);
for (int i = 0; i < length; ++i) {
if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) {
add(c);
break;
}
}
} else if (tag == Collation.EXPANSION_TAG) {
int length = Collation.lengthFromCE32(ce32);
int baseLength = Collation.lengthFromCE32(baseCE32);
if (length != baseLength) {
add(c);
return;
}
int idx0 = Collation.indexFromCE32(ce32);
int idx1 = Collation.indexFromCE32(baseCE32);
for (int i = 0; i < length; ++i) {
if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) {
add(c);
break;
}
}
} else if (tag == Collation.HANGUL_TAG) {
StringBuilder jamos = new StringBuilder();
int length = Hangul.decompose(c, jamos);
if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1))
|| (length == 3 && tailored.contains(jamos.charAt(2)))) {
add(c);
}
} else if (ce32 != baseCE32) {
add(c);
}
}
private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
// Parallel iteration over prefixes of both tables.
CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator();
String tp = null; // Tailoring prefix.
String bp = null; // Base prefix.
// Use a string with a U+FFFF as the limit sentinel.
// U+FFFF is untailorable and will not occur in prefixes.
String none = "\uffff";
Entry te = null, be = null;
for (;;) {
if (tp == null) {
if (prefixes.hasNext()) {
te = prefixes.next();
tp = te.chars.toString();
} else {
te = null;
tp = none;
}
}
if (bp == null) {
if (basePrefixes.hasNext()) {
be = basePrefixes.next();
bp = be.chars.toString();
} else {
be = null;
bp = none;
}
}
if (tp == none && bp == none) {
break;
}
int cmp = tp.compareTo(bp);
if (cmp < 0) {
// tp occurs in the tailoring but not in the base.
assert (te != null);
addPrefix(data, tp, c, te.value);
te = null;
tp = null;
} else if (cmp > 0) {
// bp occurs in the base but not in the tailoring.
assert (be != null);
addPrefix(baseData, bp, c, be.value);
be = null;
bp = null;
} else {
setPrefix(tp);
assert (te != null && be != null);
compare(c, te.value, be.value);
resetPrefix();
te = be = null;
tp = bp = null;
}
}
}
private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
// Parallel iteration over suffixes of both tables.
CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator();
String ts = null; // Tailoring suffix.
String bs = null; // Base suffix.
// Use a string with two U+FFFF as the limit sentinel.
// U+FFFF is untailorable and will not occur in contractions except maybe
// as a single suffix character for a root-collator boundary contraction.
String none = "\uffff\uffff";
Entry te = null, be = null;
for (;;) {
if (ts == null) {
if (suffixes.hasNext()) {
te = suffixes.next();
ts = te.chars.toString();
} else {
te = null;
ts = none;
}
}
if (bs == null) {
if (baseSuffixes.hasNext()) {
be = baseSuffixes.next();
bs = be.chars.toString();
} else {
be = null;
bs = none;
}
}
if (ts == none && bs == none) {
break;
}
int cmp = ts.compareTo(bs);
if (cmp < 0) {
// ts occurs in the tailoring but not in the base.
addSuffix(c, ts);
te = null;
ts = null;
} else if (cmp > 0) {
// bs occurs in the base but not in the tailoring.
addSuffix(c, bs);
be = null;
bs = null;
} else {
suffix = ts;
compare(c, te.value, be.value);
suffix = null;
te = be = null;
ts = bs = null;
}
}
}
private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) {
CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
while (prefixes.hasNext()) {
Entry e = prefixes.next();
addPrefix(d, e.chars, c, e.value);
}
}
private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) {
setPrefix(pfx);
ce32 = d.getFinalCE32(ce32);
if (Collation.isContractionCE32(ce32)) {
int idx = Collation.indexFromCE32(ce32);
addContractions(c, d.contexts, idx + 2);
}
tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c)));
resetPrefix();
}
private void addContractions(int c, CharSequence p, int pidx) {
CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
while (suffixes.hasNext()) {
Entry e = suffixes.next();
addSuffix(c, e.chars);
}
}
private void addSuffix(int c, CharSequence sfx) {
tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx));
}
private void add(int c) {
if (unreversedPrefix.length() == 0 && suffix == null) {
tailored.add(c);
} else {
StringBuilder s = new StringBuilder(unreversedPrefix);
s.appendCodePoint(c);
if (suffix != null) {
s.append(suffix);
}
tailored.add(s);
}
}
// Prefixes are reversed in the data structure.
private void setPrefix(CharSequence pfx) {
unreversedPrefix.setLength(0);
unreversedPrefix.append(pfx).reverse();
}
private void resetPrefix() {
unreversedPrefix.setLength(0);
}
}