| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html#License |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2014-2016, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.text; |
| |
| import com.ibm.icu.text.UnicodeSet.SpanCondition; |
| import com.ibm.icu.util.OutputInt; |
| |
| /** |
| * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches. |
| * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen. |
| * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with |
| * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on. |
| * For the inverse, the following mapping is used: |
| * <ul> |
| * <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li> |
| * <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li> |
| * <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}</li> |
| * </ul> |
| * These are actually not complete inverses. However, the alternating works because there are no gaps. |
| * For example, with [a{ab}{bc}], you get the following behavior when scanning forward: |
| * |
| * <table border="1"> |
| * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr> |
| * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr> |
| * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr> |
| * </table> |
| * <p>So here is what happens when you alternate: |
| * |
| * <table border="1"> |
| * <tr><th>start</th><td>|xxxabcyyy</td></tr> |
| * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr> |
| * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr> |
| * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr> |
| * </table> |
| * <p>The entire string is traversed. |
| * |
| * @stable ICU 54 |
| */ |
| public class UnicodeSetSpanner { |
| |
| private final UnicodeSet unicodeSet; |
| |
| /** |
| * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class |
| * can be used with a non-frozen version to avoid the cost of freezing. |
| * |
| * @param source |
| * the original UnicodeSet |
| * |
| * @stable ICU 54 |
| */ |
| public UnicodeSetSpanner(UnicodeSet source) { |
| unicodeSet = source; |
| } |
| |
| /** |
| * Returns the UnicodeSet used for processing. It is frozen iff the original was. |
| * |
| * @return the construction set. |
| * |
| * @stable ICU 54 |
| */ |
| public UnicodeSet getUnicodeSet() { |
| return unicodeSet; |
| } |
| |
| |
| /** |
| * {@inheritDoc} |
| * |
| * @stable ICU 54 |
| */ |
| @Override |
| public boolean equals(Object other) { |
| return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet); |
| } |
| |
| /** |
| * {@inheritDoc} |
| * |
| * @stable ICU 54 |
| */ |
| @Override |
| public int hashCode() { |
| return unicodeSet.hashCode(); |
| } |
| |
| /** |
| * Options for replaceFrom and countIn to control how to treat each matched span. |
| * It is similar to whether one is replacing [abc] by x, or [abc]* by x. |
| * |
| * @stable ICU 54 |
| */ |
| public enum CountMethod { |
| /** |
| * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate |
| * set elements. |
| * |
| * @stable ICU 54 |
| */ |
| WHOLE_SPAN, |
| /** |
| * Use the smallest number of elements in the spanned range for counting and modification, |
| * based on the {@link UnicodeSet.SpanCondition}. |
| * If the set has no strings, this will be the same as the number of spanned code points. |
| * <p>For example, in the string "abab" with SpanCondition.SIMPLE: |
| * <ul> |
| * <li>spanning with [ab] will count four MIN_ELEMENTS.</li> |
| * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li> |
| * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li> |
| * </ul> |
| * |
| * @stable ICU 54 |
| */ |
| MIN_ELEMENTS, |
| // Note: could in the future have an additional option MAX_ELEMENTS |
| } |
| |
| /** |
| * Returns the number of matching characters found in a character sequence, |
| * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. |
| * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. |
| * @param sequence |
| * the sequence to count characters in |
| * @return the count. Zero if there are none. |
| * |
| * @stable ICU 54 |
| */ |
| public int countIn(CharSequence sequence) { |
| return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); |
| } |
| |
| /** |
| * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE. |
| * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. |
| * @param sequence |
| * the sequence to count characters in |
| * @param countMethod |
| * whether to treat an entire span as a match, or individual elements as matches |
| * @return the count. Zero if there are none. |
| * |
| * @stable ICU 54 |
| */ |
| public int countIn(CharSequence sequence, CountMethod countMethod) { |
| return countIn(sequence, countMethod, SpanCondition.SIMPLE); |
| } |
| |
| /** |
| * Returns the number of matching characters found in a character sequence. |
| * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. |
| * @param sequence |
| * the sequence to count characters in |
| * @param countMethod |
| * whether to treat an entire span as a match, or individual elements as matches |
| * @param spanCondition |
| * the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span; |
| * NOT_CONTAINED is the reverse. |
| * <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases. |
| * @return the count. Zero if there are none. |
| * |
| * @stable ICU 54 |
| */ |
| public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) { |
| int count = 0; |
| int start = 0; |
| SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE |
| : SpanCondition.NOT_CONTAINED; |
| final int length = sequence.length(); |
| OutputInt spanCount = null; |
| while (start != length) { |
| int endOfSpan = unicodeSet.span(sequence, start, skipSpan); |
| if (endOfSpan == length) { |
| break; |
| } |
| if (countMethod == CountMethod.WHOLE_SPAN) { |
| start = unicodeSet.span(sequence, endOfSpan, spanCondition); |
| count += 1; |
| } else { |
| if (spanCount == null) { |
| spanCount = new OutputInt(); |
| } |
| start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount); |
| count += spanCount.value; |
| } |
| } |
| return count; |
| } |
| |
| /** |
| * Delete all the matching spans in sequence, using SpanCondition.SIMPLE |
| * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. |
| * @param sequence |
| * charsequence to replace matching spans in. |
| * @return modified string. |
| * |
| * @stable ICU 54 |
| */ |
| public String deleteFrom(CharSequence sequence) { |
| return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE); |
| } |
| |
| /** |
| * Delete all matching spans in sequence, according to the spanCondition. |
| * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. |
| * @param sequence |
| * charsequence to replace matching spans in. |
| * @param spanCondition |
| * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED) |
| * @return modified string. |
| * |
| * @stable ICU 54 |
| */ |
| public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) { |
| return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition); |
| } |
| |
| /** |
| * Replace all matching spans in sequence by the replacement, |
| * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. |
| * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. |
| * @param sequence |
| * charsequence to replace matching spans in. |
| * @param replacement |
| * replacement sequence. To delete, use "" |
| * @return modified string. |
| * |
| * @stable ICU 54 |
| */ |
| public String replaceFrom(CharSequence sequence, CharSequence replacement) { |
| return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); |
| } |
| |
| /** |
| * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE. |
| * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. |
| * |
| * @param sequence |
| * charsequence to replace matching spans in. |
| * @param replacement |
| * replacement sequence. To delete, use "" |
| * @param countMethod |
| * whether to treat an entire span as a match, or individual elements as matches |
| * @return modified string. |
| * |
| * @stable ICU 54 |
| */ |
| public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) { |
| return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE); |
| } |
| |
| /** |
| * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition. |
| * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. |
| * @param sequence |
| * charsequence to replace matching spans in. |
| * @param replacement |
| * replacement sequence. To delete, use "" |
| * @param countMethod |
| * whether to treat an entire span as a match, or individual elements as matches |
| * @param spanCondition |
| * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching |
| * (NOT_CONTAINED) |
| * @return modified string. |
| * |
| * @stable ICU 54 |
| */ |
| public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, |
| SpanCondition spanCondition) { |
| SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE |
| : SpanCondition.NOT_CONTAINED; |
| final boolean remove = replacement.length() == 0; |
| StringBuilder result = new StringBuilder(); |
| // TODO, we can optimize this to |
| // avoid this allocation unless needed |
| |
| final int length = sequence.length(); |
| OutputInt spanCount = null; |
| for (int endCopy = 0; endCopy != length;) { |
| int endModify; |
| if (countMethod == CountMethod.WHOLE_SPAN) { |
| endModify = unicodeSet.span(sequence, endCopy, spanCondition); |
| } else { |
| if (spanCount == null) { |
| spanCount = new OutputInt(); |
| } |
| endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount); |
| } |
| if (remove || endModify == 0) { |
| // do nothing |
| } else if (countMethod == CountMethod.WHOLE_SPAN) { |
| result.append(replacement); |
| } else { |
| for (int i = spanCount.value; i > 0; --i) { |
| result.append(replacement); |
| } |
| } |
| if (endModify == length) { |
| break; |
| } |
| endCopy = unicodeSet.span(sequence, endModify, copySpan); |
| result.append(sequence.subSequence(endModify, endCopy)); |
| } |
| return result.toString(); |
| } |
| |
| /** |
| * Options for the trim() method |
| * |
| * @stable ICU 54 |
| */ |
| public enum TrimOption { |
| /** |
| * Trim leading spans. |
| * |
| * @stable ICU 54 |
| */ |
| LEADING, |
| /** |
| * Trim leading and trailing spans. |
| * |
| * @stable ICU 54 |
| */ |
| BOTH, |
| /** |
| * Trim trailing spans. |
| * |
| * @stable ICU 54 |
| */ |
| TRAILING; |
| } |
| |
| /** |
| * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and |
| * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example: |
| * |
| * <pre> |
| * {@code |
| * |
| * new UnicodeSet("[ab]").trim("abacatbab")} |
| * </pre> |
| * |
| * ... returns {@code "cat"}. |
| * @param sequence |
| * the sequence to trim |
| * @return a subsequence |
| * |
| * @stable ICU 54 |
| */ |
| public CharSequence trim(CharSequence sequence) { |
| return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE); |
| } |
| |
| /** |
| * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or |
| * end of the string, using the trimOption and SpanCondition.SIMPLE. For example: |
| * |
| * <pre> |
| * {@code |
| * |
| * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)} |
| * </pre> |
| * |
| * ... returns {@code "catbab"}. |
| * |
| * @param sequence |
| * the sequence to trim |
| * @param trimOption |
| * LEADING, TRAILING, or BOTH |
| * @return a subsequence |
| * |
| * @stable ICU 54 |
| */ |
| public CharSequence trim(CharSequence sequence, TrimOption trimOption) { |
| return trim(sequence, trimOption, SpanCondition.SIMPLE); |
| } |
| |
| /** |
| * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or |
| * end of the string, depending on the trimOption and spanCondition. For example: |
| * |
| * <pre> |
| * {@code |
| * |
| * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)} |
| * </pre> |
| * |
| * ... returns {@code "catbab"}. |
| * |
| * @param sequence |
| * the sequence to trim |
| * @param trimOption |
| * LEADING, TRAILING, or BOTH |
| * @param spanCondition |
| * SIMPLE, CONTAINED or NOT_CONTAINED |
| * @return a subsequence |
| * |
| * @stable ICU 54 |
| */ |
| public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) { |
| int endLeadContained, startTrailContained; |
| final int length = sequence.length(); |
| if (trimOption != TrimOption.TRAILING) { |
| endLeadContained = unicodeSet.span(sequence, spanCondition); |
| if (endLeadContained == length) { |
| return ""; |
| } |
| } else { |
| endLeadContained = 0; |
| } |
| if (trimOption != TrimOption.LEADING) { |
| startTrailContained = unicodeSet.spanBack(sequence, spanCondition); |
| } else { |
| startTrailContained = length; |
| } |
| return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence( |
| endLeadContained, startTrailContained); |
| } |
| |
| } |