| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ********************************************************************** |
| * Copyright (C) 2008-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 05/11/2008 Andy Heninger Port from Java |
| ********************************************************************** |
| */ |
| |
| #include <utility> |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION |
| |
| #include "unicode/brkiter.h" |
| #include "unicode/localpointer.h" |
| #include "unicode/uchar.h" |
| #include "unicode/unifilt.h" |
| #include "unicode/uniset.h" |
| |
| #include "brktrans.h" |
| #include "cmemory.h" |
| #include "mutex.h" |
| #include "uprops.h" |
| #include "uinvchar.h" |
| #include "util.h" |
| #include "uvectr32.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) |
| |
| static const UChar SPACE = 32; // ' ' |
| |
| |
| /** |
| * Constructs a transliterator with the default delimiters '{' and |
| * '}'. |
| */ |
| BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : |
| Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), |
| cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) { |
| } |
| |
| |
| /** |
| * Destructor. |
| */ |
| BreakTransliterator::~BreakTransliterator() { |
| } |
| |
| /** |
| * Copy constructor. |
| */ |
| BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : |
| Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) { |
| } |
| |
| |
| /** |
| * Transliterator API. |
| */ |
| BreakTransliterator* BreakTransliterator::clone() const { |
| return new BreakTransliterator(*this); |
| } |
| |
| /** |
| * Implements {@link Transliterator#handleTransliterate}. |
| */ |
| void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, |
| UBool isIncremental ) const { |
| |
| UErrorCode status = U_ZERO_ERROR; |
| LocalPointer<BreakIterator> bi; |
| LocalPointer<UVector32> boundaries; |
| |
| { |
| Mutex m; |
| BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); |
| boundaries = std::move(nonConstThis->cachedBoundaries); |
| bi = std::move(nonConstThis->cachedBI); |
| } |
| if (bi.isNull()) { |
| bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); |
| } |
| if (boundaries.isNull()) { |
| boundaries.adoptInstead(new UVector32(status)); |
| } |
| |
| if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { |
| return; |
| } |
| |
| boundaries->removeAllElements(); |
| UnicodeString sText = replaceableAsString(text); |
| bi->setText(sText); |
| bi->preceding(offsets.start); |
| |
| // To make things much easier, we will stack the boundaries, and then insert at the end. |
| // generally, we won't need too many, since we will be filtered. |
| |
| int32_t boundary; |
| for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { |
| if (boundary == 0) continue; |
| // HACK: Check to see that preceeding item was a letter |
| |
| UChar32 cp = sText.char32At(boundary-1); |
| int type = u_charType(cp); |
| //System.out.println(Integer.toString(cp,16) + " (before): " + type); |
| if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
| |
| cp = sText.char32At(boundary); |
| type = u_charType(cp); |
| //System.out.println(Integer.toString(cp,16) + " (after): " + type); |
| if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
| |
| boundaries->addElement(boundary, status); |
| // printf("Boundary at %d\n", boundary); |
| } |
| |
| int delta = 0; |
| int lastBoundary = 0; |
| |
| if (boundaries->size() != 0) { // if we found something, adjust |
| delta = boundaries->size() * fInsertion.length(); |
| lastBoundary = boundaries->lastElementi(); |
| |
| // we do this from the end backwards, so that we don't have to keep updating. |
| |
| while (boundaries->size() > 0) { |
| boundary = boundaries->popi(); |
| text.handleReplaceBetween(boundary, boundary, fInsertion); |
| } |
| } |
| |
| // Now fix up the return values |
| offsets.contextLimit += delta; |
| offsets.limit += delta; |
| offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; |
| |
| // Return break iterator & boundaries vector to the cache. |
| { |
| Mutex m; |
| BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); |
| if (nonConstThis->cachedBI.isNull()) { |
| nonConstThis->cachedBI = std::move(bi); |
| } |
| if (nonConstThis->cachedBoundaries.isNull()) { |
| nonConstThis->cachedBoundaries = std::move(boundaries); |
| } |
| } |
| |
| // TODO: do something with U_FAILURE(status); |
| // (need to look at transliterators overall, not just here.) |
| } |
| |
| // |
| // getInsertion() |
| // |
| const UnicodeString &BreakTransliterator::getInsertion() const { |
| return fInsertion; |
| } |
| |
| // |
| // setInsertion() |
| // |
| void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
| this->fInsertion = insertion; |
| } |
| |
| // |
| // replaceableAsString Hack to let break iterators work |
| // on the replaceable text from transliterators. |
| // In practice, the only real Replaceable type that we |
| // will be seeing is UnicodeString, so this function |
| // will normally be efficient. |
| // |
| UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { |
| UnicodeString s; |
| UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); |
| if (rs != NULL) { |
| s = *rs; |
| } else { |
| r.extractBetween(0, r.length(), s); |
| } |
| return s; |
| } |
| |
| U_NAMESPACE_END |
| |
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */