// blob: 862994d0162227e4481f177ca171780c7dd8301c [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
*
* C++ version created on: 2012sep23 (from utf16collationiterator.h)
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.text.UCharacterIterator;
/**
 * Incrementally checks the input text for FCD and normalizes where necessary.
 *
 * <p>The iterator is a small state machine (see the private {@code State} enum).
 * While the text passes the FCD check, code units are read directly from the
 * underlying text iterator. When a segment fails the check, it is collected,
 * decomposed into {@code normalized}, and read from there until exhausted.
 */
public final class FCDIterCollationIterator extends IterCollationIterator {
    /**
     * Creates an iterator that starts in forward-checking mode.
     *
     * @param data collation data; passed to the superclass and also the source of the
     *             normalizer implementation ({@code data.nfcImpl}) used for FCD checks
     * @param numeric passed through to the superclass
     * @param ui the text iterator; passed through to the superclass
     * @param startIndex initial value of {@code start}, the beginning of the text range
     *                   that is taken to pass the FCD check
     */
    public FCDIterCollationIterator(CollationData data, boolean numeric,
            UCharacterIterator ui, int startIndex) {
        super(data, numeric, ui);
        state = State.ITER_CHECK_FWD;
        start = startIndex;
        nfcImpl = data.nfcImpl;
    }

    /**
     * Resets the superclass to {@code newOffset} and restarts forward FCD checking there.
     *
     * @param newOffset the new text offset; also becomes the new {@code start}
     */
    @Override
    public void resetToOffset(int newOffset) {
        super.resetToOffset(newOffset);
        start = newOffset;
        state = State.ITER_CHECK_FWD;
    }

    /**
     * Returns the current offset in terms of the original text.
     *
     * <p>In the incremental-check states this is the text iterator's index, and inside
     * an FCD segment it is {@code pos}. While reading normalized text, {@code pos} is an
     * index into the normalized string, so the result is {@code start} when {@code pos}
     * is 0 and {@code limit} otherwise.
     *
     * @return the offset as described above
     */
    @Override
    public int getOffset() {
        if(isCheckingIncrementally()) {
            return iter.getIndex();
        } else if(state == State.ITER_IN_FCD_SEGMENT) {
            return pos;
        } else if(pos == 0) {
            return start;
        } else {
            return limit;
        }
    }

    /**
     * Returns the next code point, FCD-checking and normalizing as needed.
     *
     * @return the next code point, or a negative value when the text iterator is exhausted
     */
    @Override
    public int nextCodePoint() {
        int c;
        for(;;) {
            if(state == State.ITER_CHECK_FWD) {
                c = iter.next();
                if(c < 0) {
                    return c;
                }
                if(CollationFCD.hasTccc(c)) {
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            CollationFCD.hasLccc(iter.current())) {
                        // Possible FCD problem: un-read c and check the whole segment,
                        // then loop again in whatever state nextSegment() selected.
                        iter.previous();
                        if(!nextSegment()) {
                            return Collation.SENTINEL_CP;
                        }
                        continue;
                    }
                }
                if(isLeadSurrogate(c)) {
                    int trail = iter.next();
                    if(isTrailSurrogate(trail)) {
                        return Character.toCodePoint((char)c, (char)trail);
                    } else if(trail >= 0) {
                        // Not a pair: put back the code unit that was read ahead.
                        iter.previous();
                    }
                }
                return c;
            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
                c = iter.nextCodePoint();
                pos += Character.charCount(c);
                assert(c >= 0);
                return c;
            } else if(isInNormalizedSegment() && pos != normalized.length()) {
                c = normalized.codePointAt(pos);
                pos += Character.charCount(c);
                return c;
            } else {
                // End of the current segment (or currently checking backward).
                switchToForward();
            }
        }
    }

    /**
     * Returns the previous code point, FCD-checking and normalizing as needed.
     *
     * @return the previous code point, or {@code Collation.SENTINEL_CP} when the text
     *         iterator cannot move back any further
     */
    @Override
    public int previousCodePoint() {
        int c;
        for(;;) {
            if(state == State.ITER_CHECK_BWD) {
                c = iter.previous();
                if(c < 0) {
                    start = pos = 0;
                    state = State.ITER_IN_FCD_SEGMENT;
                    return Collation.SENTINEL_CP;
                }
                if(CollationFCD.hasLccc(c)) {
                    // prev stays negative unless the code unit before c is actually read.
                    int prev = Collation.SENTINEL_CP;
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            CollationFCD.hasTccc(prev = iter.previous())) {
                        // Possible FCD problem: move the iterator back to just after c
                        // (one step for c, one more if prev was read) and check the segment.
                        iter.next();
                        if(prev >= 0) {
                            iter.next();
                        }
                        if(!previousSegment()) {
                            return Collation.SENTINEL_CP;
                        }
                        continue;
                    }
                    // hasLccc(trail)=true for all trail surrogates
                    if(isTrailSurrogate(c)) {
                        if(prev < 0) {
                            prev = iter.previous();
                        }
                        if(isLeadSurrogate(prev)) {
                            return Character.toCodePoint((char)prev, (char)c);
                        }
                    }
                    if(prev >= 0) {
                        // prev is not part of the result: put it back.
                        iter.next();
                    }
                }
                return c;
            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) {
                c = iter.previousCodePoint();
                pos -= Character.charCount(c);
                assert(c >= 0);
                return c;
            } else if(isInNormalizedSegment() && pos != 0) {
                c = normalized.codePointBefore(pos);
                pos -= Character.charCount(c);
                return c;
            } else {
                // Start of the current segment (or currently checking forward).
                switchToBackward();
            }
        }
    }

    /**
     * Reads the next code unit and looks up its CE32 in the trie.
     *
     * @return the code unit and its CE32 combined via {@code makeCodePointAndCE32Pair},
     *         or {@code NO_CP_AND_CE32} when there is no further input
     */
    @Override
    protected long handleNextCE32() {
        int c;
        for(;;) {
            if(state == State.ITER_CHECK_FWD) {
                c = iter.next();
                if(c < 0) {
                    return NO_CP_AND_CE32;
                }
                if(CollationFCD.hasTccc(c)) {
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            CollationFCD.hasLccc(iter.current())) {
                        iter.previous();
                        if(!nextSegment()) {
                            // No code point is available on this path, so report it the
                            // same way as end of input. (A bare FALLBACK_CE32 widened to
                            // long would carry code point 0 instead.)
                            return NO_CP_AND_CE32;
                        }
                        continue;
                    }
                }
                break;
            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
                c = iter.next();
                ++pos;
                assert(c >= 0);
                break;
            } else if(isInNormalizedSegment() && pos != normalized.length()) {
                c = normalized.charAt(pos++);
                break;
            } else {
                switchToForward();
            }
        }
        return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
    }

    /**
     * Reads the code unit at the current position and consumes it only if it is a
     * trail surrogate.
     *
     * @return the code unit that was examined, narrowed to {@code char} (a negative
     *         value from the text iterator is narrowed by the cast as well)
     */
    @Override
    protected char handleGetTrailSurrogate() {
        if(!isInNormalizedSegment()) {
            int trail = iter.next();
            if(isTrailSurrogate(trail)) {
                // Keep pos in step with the iterator while inside an FCD segment.
                if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; }
            } else if(trail >= 0) {
                iter.previous();
            }
            return (char)trail;
        } else {
            assert(pos < normalized.length());
            char trail;
            if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; }
            return trail;
        }
    }

    /**
     * Moves forward by up to {@code num} code points, stopping early when
     * {@link #nextCodePoint()} returns a negative value.
     *
     * @param num the maximum number of code points to skip
     */
    @Override
    protected void forwardNumCodePoints(int num) {
        while(num > 0 && nextCodePoint() >= 0) {
            --num;
        }
    }

    /**
     * Moves backward by up to {@code num} code points, stopping early when
     * {@link #previousCodePoint()} returns a negative value.
     *
     * @param num the maximum number of code points to skip
     */
    @Override
    protected void backwardNumCodePoints(int num) {
        while(num > 0 && previousCodePoint() >= 0) {
            --num;
        }
    }

    /**
     * Returns true while the text iterator is being FCD-checked incrementally,
     * in either direction.
     */
    private boolean isCheckingIncrementally() {
        return state == State.ITER_CHECK_FWD || state == State.ITER_CHECK_BWD;
    }

    /**
     * Returns true while {@code pos} indexes into the normalized copy of a segment.
     */
    private boolean isInNormalizedSegment() {
        return state == State.IN_NORM_ITER_AT_LIMIT || state == State.IN_NORM_ITER_AT_START;
    }

    /**
     * Switches to forward checking if possible.
     */
    private void switchToForward() {
        assert(state == State.ITER_CHECK_BWD ||
                (state == State.ITER_IN_FCD_SEGMENT && pos == limit) ||
                (isInNormalizedSegment() && pos == normalized.length()));
        if(state == State.ITER_CHECK_BWD) {
            // Turn around from backward checking.
            start = pos = iter.getIndex();
            if(pos == limit) {
                state = State.ITER_CHECK_FWD;  // Check forward.
            } else {  // pos < limit
                state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
            }
        } else {
            // Reached the end of the FCD segment.
            if(state == State.ITER_IN_FCD_SEGMENT) {
                // The input text segment is FCD, extend it forward.
            } else {
                // The input text segment needed to be normalized.
                // Switch to checking forward from it.
                if(state == State.IN_NORM_ITER_AT_START) {
                    // The text iterator is still at the segment start: skip over the segment.
                    iter.moveIndex(limit - start);
                }
                start = limit;
            }
            state = State.ITER_CHECK_FWD;
        }
    }

    /**
     * Extends the FCD text segment forward or normalizes around pos.
     * On return the state is either ITER_IN_FCD_SEGMENT (iterator moved back to the
     * segment start) or IN_NORM_ITER_AT_LIMIT (segment decomposed into {@code normalized}).
     * @return true if success; every return path in this implementation returns true
     */
    private boolean nextSegment() {
        assert(state == State.ITER_CHECK_FWD);
        // The input text [start..(iter index)[ passes the FCD check.
        pos = iter.getIndex();
        // Collect the characters being checked, in case they need to be normalized.
        if(s == null) {
            s = new StringBuilder();
        } else {
            s.setLength(0);
        }
        int prevCC = 0;
        for(;;) {
            // Fetch the next character and its fcd16 value.
            int c = iter.nextCodePoint();
            if(c < 0) { break; }
            int fcd16 = nfcImpl.getFCD16(c);
            int leadCC = fcd16 >> 8;
            if(leadCC == 0 && s.length() != 0) {
                // FCD boundary before this character.
                iter.previousCodePoint();
                break;
            }
            s.appendCodePoint(c);
            if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the next FCD boundary and normalize.
                for(;;) {
                    c = iter.nextCodePoint();
                    if(c < 0) { break; }
                    if(nfcImpl.getFCD16(c) <= 0xff) {
                        // Lead combining class is 0: un-read c and stop before it.
                        iter.previousCodePoint();
                        break;
                    }
                    s.appendCodePoint(c);
                }
                normalize(s);
                start = pos;
                limit = pos + s.length();
                state = State.IN_NORM_ITER_AT_LIMIT;
                pos = 0;
                return true;
            }
            prevCC = fcd16 & 0xff;
            if(prevCC == 0) {
                // FCD boundary after the last character.
                break;
            }
        }
        limit = pos + s.length();
        assert(pos != limit);
        // The segment passed the check: rewind so it can be read from the text iterator.
        iter.moveIndex(-s.length());
        state = State.ITER_IN_FCD_SEGMENT;
        return true;
    }

    /**
     * Switches to backward checking.
     */
    private void switchToBackward() {
        assert(state == State.ITER_CHECK_FWD ||
                (state == State.ITER_IN_FCD_SEGMENT && pos == start) ||
                (isInNormalizedSegment() && pos == 0));
        if(state == State.ITER_CHECK_FWD) {
            // Turn around from forward checking.
            limit = pos = iter.getIndex();
            if(pos == start) {
                state = State.ITER_CHECK_BWD;  // Check backward.
            } else {  // pos > start
                state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
            }
        } else {
            // Reached the start of the FCD segment.
            if(state == State.ITER_IN_FCD_SEGMENT) {
                // The input text segment is FCD, extend it backward.
            } else {
                // The input text segment needed to be normalized.
                // Switch to checking backward from it.
                if(state == State.IN_NORM_ITER_AT_LIMIT) {
                    // The text iterator is still at the segment limit: move back over the segment.
                    iter.moveIndex(start - limit);
                }
                limit = start;
            }
            state = State.ITER_CHECK_BWD;
        }
    }

    /**
     * Extends the FCD text segment backward or normalizes around pos.
     * On return the state is either ITER_IN_FCD_SEGMENT (iterator moved forward to the
     * segment limit) or IN_NORM_ITER_AT_START (segment decomposed into {@code normalized}).
     * @return true if success; every return path in this implementation returns true
     */
    private boolean previousSegment() {
        assert(state == State.ITER_CHECK_BWD);
        // The input text [(iter index)..limit[ passes the FCD check.
        pos = iter.getIndex();
        // Collect the characters being checked, in case they need to be normalized.
        if(s == null) {
            s = new StringBuilder();
        } else {
            s.setLength(0);
        }
        int nextCC = 0;
        for(;;) {
            // Fetch the previous character and its fcd16 value.
            int c = iter.previousCodePoint();
            if(c < 0) { break; }
            int fcd16 = nfcImpl.getFCD16(c);
            int trailCC = fcd16 & 0xff;
            if(trailCC == 0 && s.length() != 0) {
                // FCD boundary after this character.
                iter.nextCodePoint();
                break;
            }
            s.appendCodePoint(c);
            if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
                                CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the previous FCD boundary and normalize.
                while(fcd16 > 0xff) {
                    c = iter.previousCodePoint();
                    if(c < 0) { break; }
                    fcd16 = nfcImpl.getFCD16(c);
                    if(fcd16 == 0) {
                        iter.nextCodePoint();
                        break;
                    }
                    s.appendCodePoint(c);
                }
                // Code points were appended back to front; restore text order.
                s.reverse();
                normalize(s);
                limit = pos;
                start = pos - s.length();
                state = State.IN_NORM_ITER_AT_START;
                pos = normalized.length();
                return true;
            }
            nextCC = fcd16 >> 8;
            if(nextCC == 0) {
                // FCD boundary before the following character.
                break;
            }
        }
        start = pos - s.length();
        assert(pos != start);
        // The segment passed the check: move forward again so it can be read backward
        // from the text iterator.
        iter.moveIndex(s.length());
        state = State.ITER_IN_FCD_SEGMENT;
        return true;
    }

    /**
     * Decomposes {@code s} into {@code normalized}, allocating the buffer on first use.
     *
     * @param s the text segment to decompose
     */
    private void normalize(CharSequence s) {
        if(normalized == null) {
            normalized = new StringBuilder();
        }
        // NFD without argument checking.
        nfcImpl.decompose(s, normalized);
    }

    private enum State {
        /**
         * The input text [start..(iter index)[ passes the FCD check.
         * Moving forward checks incrementally.
         * pos & limit are undefined.
         */
        ITER_CHECK_FWD,
        /**
         * The input text [(iter index)..limit[ passes the FCD check.
         * Moving backward checks incrementally.
         * start & pos are undefined.
         */
        ITER_CHECK_BWD,
        /**
         * The input text [start..limit[ passes the FCD check.
         * pos tracks the current text index.
         */
        ITER_IN_FCD_SEGMENT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized.
         * pos tracks the current index in the normalized string.
         * The text iterator is at the limit index.
         */
        IN_NORM_ITER_AT_LIMIT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized.
         * pos tracks the current index in the normalized string.
         * The text iterator is at the start index.
         */
        IN_NORM_ITER_AT_START
    }

    /** Current mode of the iterator; determines how start, pos and limit are interpreted. */
    private State state;
    /** Start index of the current segment in the input text (see {@code State}). */
    private int start;
    /** Current index: a text index in ITER_IN_FCD_SEGMENT, an index into {@code normalized} in the IN_NORM states. */
    private int pos;
    /** Limit index of the current segment in the input text (see {@code State}). */
    private int limit;
    /** Normalizer implementation taken from the collation data; supplies FCD16 values and decomposition. */
    private final Normalizer2Impl nfcImpl;
    /** Scratch buffer collecting the code points of the segment being checked; allocated on first use. */
    private StringBuilder s;
    /** Decomposed form of the most recent segment that failed the FCD check; allocated on first use. */
    private StringBuilder normalized;
}