blob: 856689a48f502bb5a66dfac30f0f41330278ff53 [file] [log] [blame]
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
// created: 2018may10 Markus W. Scherer
package com.ibm.icu.util;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
* This does not implement java.util.Map.
*
* @stable ICU 63
*/
public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
/**
* Selectors for how getRange() should report value ranges overlapping with surrogates.
* Most users should use NORMAL.
*
* @see #getRange
* @stable ICU 63
*/
public enum RangeOption {
/**
* getRange() enumerates all same-value ranges as stored in the map.
* Most users should use this option.
*
* @stable ICU 63
*/
NORMAL,
/**
* getRange() enumerates all same-value ranges as stored in the map,
* except that lead surrogates (U+D800..U+DBFF) are treated as having the
* surrogateValue, which is passed to getRange() as a separate parameter.
* The surrogateValue is not transformed via filter().
* See {@link Character#isHighSurrogate}.
*
* <p>Most users should use NORMAL instead.
*
* <p>This option is useful for maps that map surrogate code *units* to
* special values optimized for UTF-16 string processing
* or for special error behavior for unpaired surrogates,
* but those values are not to be associated with the lead surrogate code *points*.
*
* @stable ICU 63
*/
FIXED_LEAD_SURROGATES,
/**
* getRange() enumerates all same-value ranges as stored in the map,
* except that all surrogates (U+D800..U+DFFF) are treated as having the
* surrogateValue, which is passed to getRange() as a separate parameter.
* The surrogateValue is not transformed via filter().
* See {@link Character#isSurrogate}.
*
* <p>Most users should use NORMAL instead.
*
* <p>This option is useful for maps that map surrogate code *units* to
* special values optimized for UTF-16 string processing
* or for special error behavior for unpaired surrogates,
* but those values are not to be associated with the lead surrogate code *points*.
*
* @stable ICU 63
*/
FIXED_ALL_SURROGATES
}
/**
* Callback function interface: Modifies a map value.
* Optionally called by getRange().
* The modified value will be returned by the getRange() function.
*
* <p>Can be used to ignore some of the value bits,
* make a filter for one of several values,
* return a value index computed from the map value, etc.
*
* @see #getRange
* @see #iterator
* @stable ICU 63
*/
public interface ValueFilter {
/**
* Modifies the map value.
*
* @param value map value
* @return modified value
* @stable ICU 63
*/
public int apply(int value);
}
/**
* Range iteration result data.
* Code points from start to end map to the same value.
* The value may have been modified by {@link ValueFilter#apply(int)},
* or it may be the surrogateValue if a RangeOption other than "normal" was used.
*
* @see #getRange
* @see #iterator
* @stable ICU 63
*/
public static final class Range {
private int start;
private int end;
private int value;
/**
* Constructor. Sets start and end to -1 and value to 0.
*
* @stable ICU 63
*/
public Range() {
start = end = -1;
value = 0;
}
/**
* @return the start code point
* @stable ICU 63
*/
public int getStart() { return start; }
/**
* @return the (inclusive) end code point
* @stable ICU 63
*/
public int getEnd() { return end; }
/**
* @return the range value
* @stable ICU 63
*/
public int getValue() { return value; }
/**
* Sets the range. When using {@link #iterator()},
* iteration will resume after the newly set end.
*
* @param start new start code point
* @param end new end code point
* @param value new value
* @stable ICU 63
*/
public void set(int start, int end, int value) {
this.start = start;
this.end = end;
this.value = value;
}
}
private final class RangeIterator implements Iterator<Range> {
private Range range = new Range();
@Override
public boolean hasNext() {
return -1 <= range.end && range.end < 0x10ffff;
}
@Override
public Range next() {
if (getRange(range.end + 1, null, range)) {
return range;
} else {
throw new NoSuchElementException();
}
}
@Override
public final void remove() {
throw new UnsupportedOperationException();
}
}
/**
* Iterates over code points of a string and fetches map values.
* This does not implement java.util.Iterator.
*
* <pre>
* void onString(CodePointMap map, CharSequence s, int start) {
* CodePointMap.StringIterator iter = map.stringIterator(s, start);
* while (iter.next()) {
* int end = iter.getIndex(); // code point from between start and end
* useValue(s, start, end, iter.getCodePoint(), iter.getValue());
* start = end;
* }
* }
* </pre>
*
* <p>This class is not intended for public subclassing.
*
* @stable ICU 63
*/
public class StringIterator {
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected CharSequence s;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int sIndex;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int c;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int value;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected StringIterator(CharSequence s, int sIndex) {
this.s = s;
this.sIndex = sIndex;
c = -1;
value = 0;
}
/**
* Resets the iterator to a new string and/or a new string index.
*
* @param s string to iterate over
* @param sIndex string index where the iteration will start
* @stable ICU 63
*/
public void reset(CharSequence s, int sIndex) {
this.s = s;
this.sIndex = sIndex;
c = -1;
value = 0;
}
/**
* Reads the next code point, post-increments the string index,
* and gets a value from the map.
* Sets an implementation-defined error value if the code point is an unpaired surrogate.
*
* @return true if the string index was not yet at the end of the string;
* otherwise the iterator did not advance
* @stable ICU 63
*/
public boolean next() {
if (sIndex >= s.length()) {
return false;
}
c = Character.codePointAt(s, sIndex);
sIndex += Character.charCount(c);
value = get(c);
return true;
}
/**
* Reads the previous code point, pre-decrements the string index,
* and gets a value from the map.
* Sets an implementation-defined error value if the code point is an unpaired surrogate.
*
* @return true if the string index was not yet at the start of the string;
* otherwise the iterator did not advance
* @stable ICU 63
*/
public boolean previous() {
if (sIndex <= 0) {
return false;
}
c = Character.codePointBefore(s, sIndex);
sIndex -= Character.charCount(c);
value = get(c);
return true;
}
/**
* @return the string index
* @stable ICU 63
*/
public final int getIndex() { return sIndex; }
/**
* @return the code point
* @stable ICU 63
*/
public final int getCodePoint() { return c; }
/**
* @return the map value,
* or an implementation-defined error value if
* the code point is an unpaired surrogate
* @stable ICU 63
*/
public final int getValue() { return value; }
}
/**
* Protected no-args constructor.
*
* @stable ICU 63
*/
protected CodePointMap() {
}
/**
* Returns the value for a code point as stored in the map, with range checking.
* Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
*
* @param c the code point
* @return the map value,
* or an implementation-defined error value if
* the code point is not in the range 0..U+10FFFF
* @stable ICU 63
*/
public abstract int get(int c);
/**
* Sets the range object to a range of code points beginning with the start parameter.
* The range start is the same as the start input parameter
* (even if there are preceding code points that have the same value).
* The range end is the last code point such that
* all those from start to there have the same value.
* Returns false if start is not 0..U+10FFFF.
* Can be used to efficiently iterate over all same-value ranges in a map.
* (This is normally faster than iterating over code points and get()ting each value,
* but may be much slower than a data structure that stores ranges directly.)
*
* <p>If the {@link ValueFilter} parameter is not null, then
* the value to be delivered is passed through that filter, and the return value is the end
* of the range where all values are modified to the same actual value.
* The value is unchanged if that parameter is null.
*
* <p>Example:
* <pre>
* int start = 0;
* CodePointMap.Range range = new CodePointMap.Range();
* while (map.getRange(start, null, range)) {
* int end = range.getEnd();
* int value = range.getValue();
* // Work with the range start..end and its value.
* start = end + 1;
* }
* </pre>
*
* @param start range start
* @param filter an object that may modify the map data value,
* or null if the values from the map are to be used unmodified
* @param range the range object that will be set to the code point range and value
* @return true if start is 0..U+10FFFF; otherwise no new range is fetched
* @stable ICU 63
*/
public abstract boolean getRange(int start, ValueFilter filter, Range range);
/**
* Sets the range object to a range of code points beginning with the start parameter.
* The range start is the same as the start input parameter
* (even if there are preceding code points that have the same value).
* The range end is the last code point such that
* all those from start to there have the same value.
* Returns false if start is not 0..U+10FFFF.
*
* <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally
* modifies the range if it overlaps with surrogate code points.
*
* @param start range start
* @param option defines whether surrogates are treated normally,
* or as having the surrogateValue; usually {@link RangeOption#NORMAL}
* @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}
* @param filter an object that may modify the map data value,
* or null if the values from the map are to be used unmodified
* @param range the range object that will be set to the code point range and value
* @return true if start is 0..U+10FFFF; otherwise no new range is fetched
* @stable ICU 63
*/
public boolean getRange(int start, RangeOption option, int surrogateValue,
ValueFilter filter, Range range) {
assert option != null;
if (!getRange(start, filter, range)) {
return false;
}
if (option == RangeOption.NORMAL) {
return true;
}
int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
int end = range.end;
if (end < 0xd7ff || start > surrEnd) {
return true;
}
// The range overlaps with surrogates, or ends just before the first one.
if (range.value == surrogateValue) {
if (end >= surrEnd) {
// Surrogates followed by a non-surrValue range,
// or surrogates are part of a larger surrValue range.
return true;
}
} else {
if (start <= 0xd7ff) {
range.end = 0xd7ff; // Non-surrValue range ends before surrValue surrogates.
return true;
}
// Start is a surrogate with a non-surrValue code *unit* value.
// Return a surrValue code *point* range.
range.value = surrogateValue;
if (end > surrEnd) {
range.end = surrEnd; // Surrogate range ends before non-surrValue rest of range.
return true;
}
}
// See if the surrValue surrogate range can be merged with
// an immediately following range.
if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) {
range.start = start;
return true;
}
range.start = start;
range.end = surrEnd;
range.value = surrogateValue;
return true;
}
/**
* Convenience iterator over same-map-value code point ranges.
* Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)}
* without filtering.
* Adjacent ranges have different map values.
*
* <p>The iterator always returns the same Range object.
*
* @return a Range iterator
* @stable ICU 63
*/
@Override
public Iterator<Range> iterator() {
return new RangeIterator();
}
/**
* Returns an iterator (not a java.util.Iterator) over code points of a string
* for fetching map values.
*
* @param s string to iterate over
* @param sIndex string index where the iteration will start
* @return the iterator
* @stable ICU 63
*/
public StringIterator stringIterator(CharSequence s, int sIndex) {
return new StringIterator(s, sIndex);
}
}