// icu4c/source/common/rbbi_cache.cpp - external/github.com/unicode-org/icu - Git at Google

 // Copyright (C) 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html

 // file: rbbi_cache.cpp

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_BREAK_ITERATION

 #include "unicode/ubrk.h"
 #include "unicode/rbbi.h"

 #include "rbbi_cache.h"

 #include "brkeng.h"
 #include "cmemory.h"
 #include "rbbidata.h"
 #include "rbbirb.h"
 #include "uassert.h"
 #include "uvectr32.h"

 U_NAMESPACE_BEGIN

 /*
  * DictionaryCache implementation
  */

 /*
  * Build an empty dictionary cache attached to the break iterator `bi`.
  * `status` is passed on to the fBreaks constructor. All other fields start
  * with the same values that reset() assigns.
  */
 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status)
     : fBI(bi),
       fBreaks(status),
       fPositionInCache(-1),
       fStart(0),
       fLimit(0),
       fFirstRuleStatusIndex(0),
       fOtherRuleStatusIndex(0)
 {
 }

 // The destructor body is intentionally empty; in particular fBI is not deleted here.
 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {}

 /*
  * Drop every cached dictionary boundary and put all bookkeeping fields back
  * to the values the constructor gives them.
  */
 void RuleBasedBreakIterator::DictionaryCache::reset() {
     fBreaks.removeAllElements();
     fFirstRuleStatusIndex = fOtherRuleStatusIndex = 0;
     fStart = fLimit = 0;
     fPositionInCache = -1;      // -1: no current position within fBreaks
 }

 /*
  * Look up the cached dictionary boundary that follows fromPos.
  *
  * @param fromPos      position to search from
  * @param result       receives the following boundary; written only when TRUE is returned
  * @param statusIndex  receives fOtherRuleStatusIndex; written only when TRUE is returned
  * @return TRUE if a following boundary was found in the cache. FALSE if fromPos is outside
  *         [fStart, fLimit) or the sequential step ran off the end of fBreaks; in both of
  *         those cases fPositionInCache is set to -1.
  */
 UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
     if (fromPos < fStart || fromPos >= fLimit) {
         fPositionInCache = -1;
         return FALSE;
     }

     const int32_t breakCount = fBreaks.size();

     // Fast path for sequential iteration: the cache position already sits on fromPos,
     // so the answer is simply the next entry.
     if (fPositionInCache >= 0 && fPositionInCache < breakCount
             && fBreaks.elementAti(fPositionInCache) == fromPos) {
         const int32_t nextIdx = fPositionInCache + 1;
         if (nextIdx >= breakCount) {
             fPositionInCache = -1;
             return FALSE;
         }
         fPositionInCache = nextIdx;
         const int32_t boundary = fBreaks.elementAti(nextIdx);
         U_ASSERT(boundary > fromPos);
         *result = boundary;
         *statusIndex = fOtherRuleStatusIndex;
         return TRUE;
     }

     // Slow path for random access: scan from the front for the first entry beyond fromPos.
     fPositionInCache = 0;
     while (fPositionInCache < breakCount) {
         const int32_t boundary = fBreaks.elementAti(fPositionInCache);
         if (boundary > fromPos) {
             *result = boundary;
             *statusIndex = fOtherRuleStatusIndex;
             return TRUE;
         }
         ++fPositionInCache;
     }
     // fromPos < fLimit was checked above, so the scan is expected to always find an entry.
     UPRV_UNREACHABLE;
 }


 /*
  * Look up the cached dictionary boundary that precedes fromPos.
  *
  * @param fromPos      position to search from
  * @param result       receives the preceding boundary; written only when TRUE is returned
  * @param statusIndex  receives fFirstRuleStatusIndex when the boundary found equals fStart,
  *                     otherwise fOtherRuleStatusIndex; written only when TRUE is returned
  * @return TRUE if a preceding boundary was found in the cache. FALSE if fromPos is outside
  *         (fStart, fLimit] or the cache position was already at the first entry; in those
  *         cases fPositionInCache is set to -1.
  */
 UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
     if (fromPos > fLimit || fromPos <= fStart) {
         fPositionInCache = -1;
         return FALSE;
     }

     const int32_t lastIdx = fBreaks.size() - 1;

     if (fromPos == fLimit) {
         // Starting from the limit: position directly on the last entry.
         fPositionInCache = lastIdx;
         U_ASSERT(fPositionInCache < 0 || fBreaks.elementAti(fPositionInCache) == fromPos);
     }

     // Fast path for sequential iteration: step back one entry from the current one.
     if (fPositionInCache > 0 && fPositionInCache <= lastIdx
             && fBreaks.elementAti(fPositionInCache) == fromPos) {
         --fPositionInCache;
         const int32_t boundary = fBreaks.elementAti(fPositionInCache);
         U_ASSERT(boundary < fromPos);
         *result = boundary;
         if (boundary == fStart) {
             *statusIndex = fFirstRuleStatusIndex;
         } else {
             *statusIndex = fOtherRuleStatusIndex;
         }
         return TRUE;
     }

     if (fPositionInCache == 0) {
         // Already on the first entry; nothing precedes it in this cache.
         fPositionInCache = -1;
         return FALSE;
     }

     // Slow path for random access: scan from the back for the last entry before fromPos.
     fPositionInCache = lastIdx;
     while (fPositionInCache >= 0) {
         const int32_t boundary = fBreaks.elementAti(fPositionInCache);
         if (boundary < fromPos) {
             *result = boundary;
             if (boundary == fStart) {
                 *statusIndex = fFirstRuleStatusIndex;
             } else {
                 *statusIndex = fOtherRuleStatusIndex;
             }
             return TRUE;
         }
         --fPositionInCache;
     }
     // fromPos > fStart was checked above, so the scan is expected to always find an entry.
     UPRV_UNREACHABLE;
 }

 /*
  * Fill the dictionary cache with the boundaries that the language break engines
  * report for the text between startPos and endPos.
  *
  * @param startPos         native text index where the range begins
  * @param endPos           native text index where the range ends
  * @param firstRuleStatus  stored in fFirstRuleStatusIndex; preceding() reports it for the
  *                         boundary equal to fStart
  * @param otherRuleStatus  stored in fOtherRuleStatusIndex; reported for all other boundaries
  *
  * A range with (endPos - startPos) <= 1 is ignored and leaves the existing cache untouched.
  * Otherwise the cache is reset first. If the engines report no breaks, fStart and fLimit
  * stay 0, so later following()/preceding() lookups fail.
  */
 void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
                                        int32_t firstRuleStatus, int32_t otherRuleStatus) {
     if ((endPos - startPos) <= 1) {
         return;
     }

     reset();
     fFirstRuleStatusIndex = firstRuleStatus;
     fOtherRuleStatusIndex = otherRuleStatus;

     int32_t rangeStart = startPos;
     int32_t rangeEnd = endPos;

     uint16_t    category;
     int32_t     current;
     UErrorCode  status = U_ZERO_ERROR;     // local only; errors are not reported to the caller
     int32_t     foundBreakCount = 0;
     UText      *text = &fBI->fText;

     // Loop through the text, looking for ranges of dictionary characters.
     // For each span, find the appropriate break engine, and ask it to find
     // any breaks within the span.

     utext_setNativeIndex(text, rangeStart);
     UChar32     c = utext_current32(text);
     category = ucptrie_get(fBI->fData->fTrie, c);
     // Categories below dictStart are skipped by the inner loop; a category at or above it
     // stops the scan and is handed to a language break engine.
     uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;

     // status is not modified inside this loop, so the loop ends only through the break below.
     while(U_SUCCESS(status)) {
         // Skip forward over non-dictionary characters, stopping at rangeEnd.
         while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
                 && (category < dictStart)) {
             utext_next32(text);           // TODO: cleaner loop structure.
             c = utext_current32(text);
             category = ucptrie_get(fBI->fData->fTrie, c);
         }
         if (current >= rangeEnd) {
             break;
         }

         // We now have a dictionary character. Get the appropriate language object
         // to deal with it.
         const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);

         // Ask the language object if there are any breaks. It will add them to the cache and
         // leave the text pointer on the other side of its range, ready to search for the next one.
         // NOTE(review): if lbe were NULL nothing here advances the text position, so the loop
         // would not make progress; presumably getLanguageBreakEngine() never returns NULL - confirm.
         if (lbe != NULL) {
             foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks);
         }

         // Reload the loop variables for the next go-round
         c = utext_current32(text);
         category = ucptrie_get(fBI->fData->fTrie, c);
     }

     // If we found breaks, ensure that the first and last entries are
     // the original starting and ending position. And initialize the
     // cache iteration position to the first entry.

     // printf("foundBreakCount = %d\n", foundBreakCount);
     if (foundBreakCount > 0) {
         U_ASSERT(foundBreakCount == fBreaks.size());
         if (startPos < fBreaks.elementAti(0)) {
             // The dictionary did not place a boundary at the start of the segment of text.
             // Add one now. This should not commonly happen, but it would be easy for interactions
             // of the rules for dictionary segments and the break engine implementations to
             // inadvertently cause it. Cover it here, just in case.
             fBreaks.insertElementAt(startPos, 0, status);
         }
         if (endPos > fBreaks.peeki()) {
             // Likewise make sure the last entry reaches endPos.
             fBreaks.push(endPos, status);
         }
         fPositionInCache = 0;
         // Note: Dictionary matching may extend beyond the original limit.
         fStart = fBreaks.elementAti(0);
         fLimit = fBreaks.peeki();
     } else {
         // there were no language-based breaks, even though the segment contained
         // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
         // for this range will fail, and the calling code will fall back to the rule based boundaries.
     }
 }


 /*
  *   BreakCache implementation
  */

 /*
  * Build a break cache attached to the break iterator `bi`. `status` is passed on to the
  * fSideBuffer constructor; the remaining state is set up by reset().
  */
 RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status)
     : fBI(bi),
       fSideBuffer(status)
 {
     reset();
 }


 // The destructor body is intentionally empty; in particular fBI is not deleted here.
 RuleBasedBreakIterator::BreakCache::~BreakCache() {}


 /*
  * Collapse the cache to a single boundary.
  *
  * @param pos         text position of the one remaining boundary
  * @param ruleStatus  rule status index stored with it (narrowed to uint16_t)
  */
 void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
     // The single entry lives in slot 0, which is at once the start, the end
     // and the current iteration position of the circular buffer.
     fBoundaries[0] = pos;
     fStatuses[0] = static_cast<uint16_t>(ruleStatus);
     fStartBufIdx = fEndBufIdx = fBufIdx = 0;
     fTextIdx = pos;
 }


 /*
  * Publish the cache's current position to the owning break iterator:
  * sets its fPosition and fRuleStatusIndex and clears its fDone flag.
  *
  * @return the current text position (fTextIdx)
  */
 int32_t  RuleBasedBreakIterator::BreakCache::current() {
     RuleBasedBreakIterator &owner = *fBI;
     owner.fDone = FALSE;
     owner.fRuleStatusIndex = fStatuses[fBufIdx];
     owner.fPosition = fTextIdx;
     return fTextIdx;
 }


 /*
  * Move to the boundary following startPos.
  *
  * The cache is first positioned at startPos (already there, via seek(), or via populateNear()),
  * then a next() is performed from that position. Does nothing if status already indicates
  * failure or if the cache could not be positioned.
  */
 void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
     if (U_FAILURE(status)) {
         return;
     }

     // Short-circuit order matters: seek() and populateNear() are tried only when needed.
     const UBool positioned =
             (startPos == fTextIdx) || seek(startPos) || populateNear(startPos, status);
     if (!positioned) {
         return;
     }

     // startPos is in the cache. Do a next() from that position.
     // TODO: an awkward set of interactions with bi->fDone
     //       seek() does not clear it; it can't because of interactions with populateNear().
     //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
     //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
     fBI->fDone = false;
     next();
 }


 /*
  * Move to the boundary preceding startPos.
  *
  * The cache is first positioned at or before startPos (already there, via seek(), or via
  * populateNear()). Does nothing if status already indicates failure or if the cache could
  * not be positioned.
  */
 void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
     if (U_FAILURE(status)) {
         return;
     }

     // Short-circuit order matters: seek() and populateNear() are tried only when needed.
     const UBool positioned =
             (startPos == fTextIdx) || seek(startPos) || populateNear(startPos, status);
     if (!positioned) {
         return;
     }

     if (startPos != fTextIdx) {
         // seek() leaves the BreakCache positioned at the preceding boundary
         //        if the requested position is between two boundaries.
         // current() pushes the BreakCache position out to the BreakIterator itself.
         U_ASSERT(startPos > fTextIdx);
         current();
         return;
     }

     // startPos is itself a boundary: step back one boundary from it.
     previous(status);
 }


 /*
  * Out-of-line code for BreakCache::next().
  * Cache does not already contain the boundary
  */
 void RuleBasedBreakIterator::BreakCache::nextOL() {
     fBI->fDone = !populateFollowing();
     fBI->fPosition = fTextIdx;
     fBI->fRuleStatusIndex = fStatuses[fBufIdx];
     return;
 }


 /*
  * Move the cache position back by one boundary and publish the result to the
  * break iterator. The iterator's fDone flag is set when the buffer index did not
  * change. Does nothing if status already indicates failure.
  */
 void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
     if (U_FAILURE(status)) {
         return;
     }

     const int32_t idxBefore = fBufIdx;
     if (fBufIdx != fStartBufIdx) {
         // The cache already holds the preceding boundary; just step back to it.
         fBufIdx = modChunkSize(fBufIdx - 1);
         fTextIdx = fBoundaries[fBufIdx];
     } else {
         // At start of cache. Prepend to it.
         populatePreceding(status);
     }

     fBI->fDone = (fBufIdx == idxBefore);
     fBI->fPosition = fTextIdx;
     fBI->fRuleStatusIndex = fStatuses[fBufIdx];
 }


 /*
  * Position the cache at the boundary at or immediately preceding pos, using only
  * boundaries that are already cached.
  *
  * @param pos  requested text position
  * @return FALSE (cache position unchanged) if pos lies outside the cached range;
  *         TRUE otherwise, with fBufIdx/fTextIdx updated.
  */
 UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
     const int32_t firstBoundary = fBoundaries[fStartBufIdx];
     const int32_t lastBoundary = fBoundaries[fEndBufIdx];
     if (pos < firstBoundary || lastBoundary < pos) {
         return FALSE;
     }

     if (pos == firstBoundary) {
         // Common case: seek(0), from BreakIterator::first()
         fBufIdx = fStartBufIdx;
         fTextIdx = fBoundaries[fBufIdx];
         return TRUE;
     }
     if (pos == lastBoundary) {
         fBufIdx = fEndBufIdx;
         fTextIdx = fBoundaries[fBufIdx];
         return TRUE;
     }

     // Binary search over the circular buffer for the first boundary greater than pos.
     // When the live region wraps (lo > hi), CACHE_SIZE is added before halving so the
     // midpoint falls inside the live region once reduced by modChunkSize().
     int32_t lo = fStartBufIdx;
     int32_t hi = fEndBufIdx;
     while (lo != hi) {
         const int32_t wrapAdjust = (lo > hi) ? CACHE_SIZE : 0;
         const int32_t mid = modChunkSize((lo + hi + wrapAdjust) / 2);
         if (fBoundaries[mid] > pos) {
             hi = mid;
         } else {
             lo = modChunkSize(mid + 1);
         }
     }
     U_ASSERT(fBoundaries[hi] > pos);

     // The entry just before that one is the boundary at or preceding pos.
     fBufIdx = modChunkSize(hi - 1);
     fTextIdx = fBoundaries[fBufIdx];
     U_ASSERT(fTextIdx <= pos);
     return TRUE;
 }


 /*
  * Bring `position`, which lies outside the currently cached range, into the cache and
  * leave the cache positioned on the boundary at or immediately preceding it.
  *
  * If position is more than 15 beyond either end of the cached range, the cache is first
  * reset to a single nearby boundary (or to 0). The cache is then extended forwards or
  * backwards until it covers position.
  *
  * @param position  requested text position; must be outside the cached range
  * @param status    ICU error code; nothing is done if it already indicates failure
  * @return TRUE when the cache has been positioned; FALSE if status indicated failure on
  *         entry, or the cache could not be extended far enough (for example because
  *         status turned to failure while extending backwards).
  */
 UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
     if (U_FAILURE(status)) {
         return FALSE;
     }
     U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

     // Find a boundary somewhere in the vicinity of the requested position.
     // Depending on the safe rules and the text data, it could be either before, at, or after
     // the requested position.


     // If the requested position is not near already cached positions, clear the existing cache,
     // find a near-by boundary and begin new cache contents there.

     if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
         int32_t aBoundary = 0;
         int32_t ruleStatusIndex = 0;
         if (position > 20) {
             int32_t backupPos = fBI->handleSafePrevious(position);

             if (backupPos > 0) {
                 // Advance to the boundary following the backup position.
                 // There is a complication: the safe reverse rules identify pairs of code points
                 // that are safe. If advancing from the safe point moves forwards by less than
                 // two code points, we need to advance one more time to ensure that the boundary
                 // is good, including a correct rules status value.
                 //
                 fBI->fPosition = backupPos;
                 aBoundary = fBI->handleNext();
                 if (aBoundary <= backupPos + 4) {
                     // +4 is a quick test for possibly having advanced only one codepoint.
                     // Four being the length of the longest potential code point, a supplementary in UTF-8
                     utext_setNativeIndex(&fBI->fText, aBoundary);
                     if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
                         // The initial handleNext() only advanced by a single code point. Go again.
                         aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
                     }
                 }
                 ruleStatusIndex = fBI->fRuleStatusIndex;
             }
         }
         reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
     }

     // Fill in boundaries between existing cache content and the new requested position.

     if (fBoundaries[fEndBufIdx] < position) {
         // The last position in the cache precedes the requested position.
         // Add following position(s) to the cache.
         while (fBoundaries[fEndBufIdx] < position) {
             if (!populateFollowing()) {
                 UPRV_UNREACHABLE;
             }
         }
         fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
         fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
         while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
             previous(status);
             if (U_FAILURE(status)) {
                 // previous() is a no-op once status has failed; stop rather than spin forever.
                 return FALSE;
             }
         }
         return true;
     }

     if (fBoundaries[fStartBufIdx] > position) {
         // The first position in the cache is beyond the requested position.
         // back up more until we get a boundary <= the requested position.
         while (fBoundaries[fStartBufIdx] > position) {
             if (!populatePreceding(status)) {
                 // Nothing was prepended (status failure, or no earlier boundary exists).
                 // The loop condition cannot change any more, so bail out instead of looping forever.
                 return FALSE;
             }
         }
         fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
         fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
         while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
             next();
         }
         if (fTextIdx > position) {
             // If position is not itself a boundary, the next() loop above will overshoot.
             // Back up one, leaving cache position at the boundary preceding the requested position.
             previous(status);
         }
         return true;
     }

     U_ASSERT(fTextIdx == position);
     return true;
 }


 /*
  * Append to the cache the boundary that follows the last cached one, and make it
  * the current cache position.
  *
  * The dictionary cache is consulted first. Otherwise the rule engine (handleNext())
  * supplies the next boundary, and a segment containing dictionary characters is
  * subdivided through the dictionary cache. When the boundary came straight from
  * the rules, up to six further rule-based boundaries are appended without moving
  * the current position.
  *
  * @return FALSE if handleNext() reported UBRK_DONE for the first new boundary, TRUE otherwise.
  */
 UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
     const int32_t lastCachedPos = fBoundaries[fEndBufIdx];
     const int32_t lastCachedStatusIdx = fStatuses[fEndBufIdx];
     int32_t boundary = 0;
     int32_t boundaryStatusIdx = 0;

     // 1. The dictionary cache may already know the answer.
     if (fBI->fDictionaryCache->following(lastCachedPos, &boundary, &boundaryStatusIdx)) {
         addFollowing(boundary, boundaryStatusIdx, UpdateCachePosition);
         return TRUE;
     }

     // 2. Ask the rules for the next boundary.
     fBI->fPosition = lastCachedPos;
     boundary = fBI->handleNext();
     if (boundary == UBRK_DONE) {
         return FALSE;
     }
     boundaryStatusIdx = fBI->fRuleStatusIndex;

     if (fBI->fDictionaryCharCount > 0) {
         // The text segment obtained from the rules includes dictionary characters.
         // Subdivide it, with subdivided results going into the dictionary cache.
         fBI->fDictionaryCache->populateDictionary(lastCachedPos, boundary,
                                                   lastCachedStatusIdx, boundaryStatusIdx);
         if (fBI->fDictionaryCache->following(lastCachedPos, &boundary, &boundaryStatusIdx)) {
             // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
             //       But be careful with interactions with populateNear().
             addFollowing(boundary, boundaryStatusIdx, UpdateCachePosition);
             return TRUE;
         }
     }

     // 3. Either the rule based segment held no dictionary characters, or it did but the
     //    dictionary segmenter did not handle them. Cache the segment's end point.
     addFollowing(boundary, boundaryStatusIdx, UpdateCachePosition);

     // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
     //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
     int remaining = 6;
     while (remaining-- > 0) {
         boundary = fBI->handleNext();
         if (boundary == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
             break;
         }
         addFollowing(boundary, fBI->fRuleStatusIndex, RetainCachePosition);
     }

     return TRUE;
 }


 /*
  * Prepend to the cache the boundary that precedes the first cached boundary, and make
  * it the current cache position. Further earlier boundaries found along the way are
  * prepended as well, as long as the circular buffer has room for them.
  *
  * @param status  ICU error code; nothing is done if it already indicates failure.
  *                It is also passed to the fSideBuffer.addElement() calls.
  * @return TRUE if a boundary was prepended and became the current position; FALSE if
  *         status indicated failure on entry, the first cached boundary is already 0,
  *         or the side buffer ended up empty.
  */
 UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
     if (U_FAILURE(status)) {
         return FALSE;
     }

     int32_t fromPosition = fBoundaries[fStartBufIdx];
     if (fromPosition == 0) {
         // Already at the start of the text; nothing can precede it.
         return FALSE;
     }

     int32_t position = 0;
     int32_t positionStatusIdx = 0;

     // The dictionary cache may already hold the preceding boundary.
     if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
         addPreceding(position, positionStatusIdx, UpdateCachePosition);
         return TRUE;
     }

     int32_t backupPosition = fromPosition;

     // Find a boundary somewhere preceding the first already-cached boundary
     // Each pass backs up a further 30 from the previous attempt; the loop repeats
     // until the boundary found lies before fromPosition.
     do {
         backupPosition = backupPosition - 30;
         if (backupPosition <= 0) {
             backupPosition = 0;
         } else {
             backupPosition = fBI->handleSafePrevious(backupPosition);
         }
         if (backupPosition == UBRK_DONE || backupPosition == 0) {
             // Backed up to the very start: use position 0 with status index 0.
             position = 0;
             positionStatusIdx = 0;
         } else {
             // Advance to the boundary following the backup position.
             // There is a complication: the safe reverse rules identify pairs of code points
             // that are safe. If advancing from the safe point moves forwards by less than
             // two code points, we need to advance one more time to ensure that the boundary
             // is good, including a correct rules status value.
             //
             fBI->fPosition = backupPosition;
             position = fBI->handleNext();
             if (position <= backupPosition + 4) {
                 // +4 is a quick test for possibly having advanced only one codepoint.
                 // Four being the length of the longest potential code point, a supplementary in UTF-8
                 utext_setNativeIndex(&fBI->fText, position);
                 if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
                     // The initial handleNext() only advanced by a single code point. Go again.
                     position = fBI->handleNext();   // Safe rules identify safe pairs.
                 }
             }
             positionStatusIdx = fBI->fRuleStatusIndex;
         }
     } while (position >= fromPosition);

     // Find boundaries between the one we just located and the first already-cached boundary
     // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
     // The side buffer holds (position, status index) pairs, pushed in that order.

     fSideBuffer.removeAllElements();
     fSideBuffer.addElement(position, status);
     fSideBuffer.addElement(positionStatusIdx, status);

     do {
         int32_t prevPosition = fBI->fPosition = position;
         int32_t prevStatusIdx = positionStatusIdx;
         position = fBI->handleNext();
         positionStatusIdx = fBI->fRuleStatusIndex;
         if (position == UBRK_DONE) {
             break;
         }

         UBool segmentHandledByDictionary = FALSE;
         if (fBI->fDictionaryCharCount != 0) {
             // Segment from the rules includes dictionary characters.
             // Subdivide it, with subdivided results going into the dictionary cache.
             int32_t dictSegEndPosition = position;
             fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
             // Walk the dictionary boundaries, collecting those that lie before fromPosition.
             while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
                 segmentHandledByDictionary = true;
                 U_ASSERT(position > prevPosition);
                 if (position >= fromPosition) {
                     break;
                 }
                 U_ASSERT(position <= dictSegEndPosition);
                 fSideBuffer.addElement(position, status);
                 fSideBuffer.addElement(positionStatusIdx, status);
                 prevPosition = position;
             }
             U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
         }

         // A purely rule-based boundary that lies before fromPosition is collected too.
         if (!segmentHandledByDictionary && position < fromPosition) {
             fSideBuffer.addElement(position, status);
             fSideBuffer.addElement(positionStatusIdx, status);
         }
     } while (position < fromPosition);

     // Move boundaries from the side buffer to the main circular buffer.
     // Entries are popped in reverse order of insertion: status index first, then position,
     // starting with the boundary closest to fromPosition.
     UBool success = FALSE;
     if (!fSideBuffer.isEmpty()) {
         // The closest preceding boundary becomes the new current cache position.
         positionStatusIdx = fSideBuffer.popi();
         position = fSideBuffer.popi();
         addPreceding(position, positionStatusIdx, UpdateCachePosition);
         success = TRUE;
     }

     while (!fSideBuffer.isEmpty()) {
         positionStatusIdx = fSideBuffer.popi();
         position = fSideBuffer.popi();
         if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
             // No space in circular buffer to hold a new preceding result while
             // also retaining the current cache (iteration) position.
             // Bailing out is safe; the cache will refill again if needed.
             break;
         }
     }

     return success;
 }


 /*
  * Append a boundary to the end of the circular cache buffer.
  *
  * @param position       text position of the new boundary; must be beyond the last cached one
  * @param ruleStatusIdx  rule status index stored with it (must fit in uint16_t)
  * @param update         UpdateCachePosition makes the new entry the current cache position;
  *                       otherwise the current position is left alone
  *
  * When the buffer is full, the start of the cached range is advanced by six slots to make room.
  */
 void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
     U_ASSERT(position > fBoundaries[fEndBufIdx]);
     U_ASSERT(ruleStatusIdx <= UINT16_MAX);

     const int32_t slot = modChunkSize(fEndBufIdx + 1);
     if (slot == fStartBufIdx) {
         fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
     }

     fEndBufIdx = slot;
     fBoundaries[slot] = position;
     fStatuses[slot] = static_cast<uint16_t>(ruleStatusIdx);

     if (update != UpdateCachePosition) {
         // Retaining the original cache position.
         // Check if the added boundary wraps around the buffer, and would over-write the original position.
         // It's the responsibility of callers of this function to not add too many.
         U_ASSERT(slot != fBufIdx);
         return;
     }

     // Set current position to the newly added boundary.
     fBufIdx = slot;
     fTextIdx = position;
 }

 /*
  * Prepend a boundary to the front of the circular cache buffer.
  *
  * @param position       text position of the new boundary; must precede the first cached one
  * @param ruleStatusIdx  rule status index stored with it (must fit in uint16_t)
  * @param update         UpdateCachePosition makes the new entry the current cache position;
  *                       RetainCachePosition leaves the current position alone
  * @return false, with the cache unchanged, when the buffer is full, the current position is
  *         its last entry, and that position must be retained; true otherwise.
  */
 bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
     U_ASSERT(position < fBoundaries[fStartBufIdx]);
     U_ASSERT(ruleStatusIdx <= UINT16_MAX);

     const int32_t slot = modChunkSize(fStartBufIdx - 1);
     if (slot == fEndBufIdx) {
         // The buffer is full: the new entry would take over the slot of the last entry.
         const bool mustKeepCurrent = (update == RetainCachePosition) && (fBufIdx == fEndBufIdx);
         if (mustKeepCurrent) {
             // Failure. The insertion of the new boundary would claim the buffer position that is the
             // current iteration position. And we also want to retain the current iteration position.
             // (The buffer is already completely full of entries that precede the iteration position.)
             return false;
         }
         fEndBufIdx = modChunkSize(fEndBufIdx - 1);
     }

     fStartBufIdx = slot;
     fBoundaries[slot] = position;
     fStatuses[slot] = static_cast<uint16_t>(ruleStatusIdx);

     if (update == UpdateCachePosition) {
         fBufIdx = slot;
         fTextIdx = position;
     }
     return true;
 }


 /*
  * Debugging aid: when RBBI_DEBUG is defined, print the current position followed by
  * every cached (buffer index, boundary) pair from start to end. Otherwise a no-op.
  */
 void RuleBasedBreakIterator::BreakCache::dumpCache() {
 #ifdef RBBI_DEBUG
     RBBIDebugPrintf("fTextIdx:%d   fBufIdx:%d\n", fTextIdx, fBufIdx);
     int32_t slot = fStartBufIdx;
     for (;;) {
         RBBIDebugPrintf("%d  %d\n", slot, fBoundaries[slot]);
         if (slot == fEndBufIdx) {
             break;
         }
         slot = modChunkSize(slot + 1);
     }
 #endif
 }

 U_NAMESPACE_END

 #endif // #if !UCONFIG_NO_BREAK_ITERATION
	// Copyright (C) 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html

	// file: rbbi_cache.cpp

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_BREAK_ITERATION

	#include "unicode/ubrk.h"
	#include "unicode/rbbi.h"

	#include "rbbi_cache.h"

	#include "brkeng.h"
	#include "cmemory.h"
	#include "rbbidata.h"
	#include "rbbirb.h"
	#include "uassert.h"
	#include "uvectr32.h"

	U_NAMESPACE_BEGIN

	/*
	* DictionaryCache implementation
	*/

	/*
	 * Build an empty dictionary cache attached to the break iterator `bi`.
	 * `status` is passed on to the fBreaks constructor. All other fields start
	 * with the same values that reset() assigns.
	 */
	RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status)
	    : fBI(bi),
	      fBreaks(status),
	      fPositionInCache(-1),
	      fStart(0),
	      fLimit(0),
	      fFirstRuleStatusIndex(0),
	      fOtherRuleStatusIndex(0)
	{
	}

	// The destructor body is intentionally empty; in particular fBI is not deleted here.
	RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {}

	/*
	 * Drop every cached dictionary boundary and put all bookkeeping fields back
	 * to the values the constructor gives them.
	 */
	void RuleBasedBreakIterator::DictionaryCache::reset() {
	    fBreaks.removeAllElements();
	    fFirstRuleStatusIndex = fOtherRuleStatusIndex = 0;
	    fStart = fLimit = 0;
	    fPositionInCache = -1;      // -1: no current position within fBreaks
	}

	/*
	 * Look up the cached dictionary boundary that follows fromPos.
	 *
	 * @param fromPos      position to search from
	 * @param result       receives the following boundary; written only when TRUE is returned
	 * @param statusIndex  receives fOtherRuleStatusIndex; written only when TRUE is returned
	 * @return TRUE if a following boundary was found in the cache. FALSE if fromPos is outside
	 *         [fStart, fLimit) or the sequential step ran off the end of fBreaks; in both of
	 *         those cases fPositionInCache is set to -1.
	 */
	UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
	    if (fromPos >= fLimit || fromPos < fStart) {
	        fPositionInCache = -1;
	        return FALSE;
	    }

	    // Sequential iteration, move from previous boundary to the following

	    int32_t r = 0;
	    if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
	        ++fPositionInCache;
	        if (fPositionInCache >= fBreaks.size()) {
	            fPositionInCache = -1;
	            return FALSE;
	        }
	        r = fBreaks.elementAti(fPositionInCache);
	        U_ASSERT(r > fromPos);
	        *result = r;
	        *statusIndex = fOtherRuleStatusIndex;
	        return TRUE;
	    }

	    // Random indexing. Linear search for the boundary following the given position.

	    for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
	        r = fBreaks.elementAti(fPositionInCache);
	        if (r > fromPos) {
	            *result = r;
	            *statusIndex = fOtherRuleStatusIndex;
	            return TRUE;
	        }
	    }
	    // fromPos < fLimit was checked above, so the scan is expected to always find an entry.
	    UPRV_UNREACHABLE;
	}


	/*
	 * Look up the cached dictionary boundary that precedes fromPos.
	 *
	 * @param fromPos      position to search from
	 * @param result       receives the preceding boundary; written only when TRUE is returned
	 * @param statusIndex  receives fFirstRuleStatusIndex when the boundary found equals fStart,
	 *                     otherwise fOtherRuleStatusIndex; written only when TRUE is returned
	 * @return TRUE if a preceding boundary was found in the cache. FALSE if fromPos is outside
	 *         (fStart, fLimit] or the cache position was already at the first entry; in those
	 *         cases fPositionInCache is set to -1.
	 */
	UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
	    if (fromPos <= fStart || fromPos > fLimit) {
	        fPositionInCache = -1;
	        return FALSE;
	    }

	    if (fromPos == fLimit) {
	        // Starting from the limit: position directly on the last entry.
	        fPositionInCache = fBreaks.size() - 1;
	        if (fPositionInCache >= 0) {
	            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
	        }
	    }

	    // Sequential iteration: step back one entry from the current one.
	    int32_t r;
	    if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
	        --fPositionInCache;
	        r = fBreaks.elementAti(fPositionInCache);
	        U_ASSERT(r < fromPos);
	        *result = r;
	        *statusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
	        return TRUE;
	    }

	    if (fPositionInCache == 0) {
	        // Already on the first entry; nothing precedes it in this cache.
	        fPositionInCache = -1;
	        return FALSE;
	    }

	    // Random indexing: scan from the back for the last entry before fromPos.
	    for (fPositionInCache = fBreaks.size() - 1; fPositionInCache >= 0; --fPositionInCache) {
	        r = fBreaks.elementAti(fPositionInCache);
	        if (r < fromPos) {
	            *result = r;
	            *statusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
	            return TRUE;
	        }
	    }
	    // fromPos > fStart was checked above, so the scan is expected to always find an entry.
	    UPRV_UNREACHABLE;
	}

	/*
	 * Fill the dictionary cache with the boundaries that the language break engines
	 * report for the text between startPos and endPos.
	 *
	 * firstRuleStatus is stored in fFirstRuleStatusIndex and otherRuleStatus in
	 * fOtherRuleStatusIndex.
	 *
	 * A range with (endPos - startPos) <= 1 is ignored and leaves the existing cache untouched.
	 * Otherwise the cache is reset first. If the engines report no breaks, fStart and fLimit
	 * stay 0.
	 */
	void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
	int32_t firstRuleStatus, int32_t otherRuleStatus) {
	if ((endPos - startPos) <= 1) {
	return;
	}

	reset();
	fFirstRuleStatusIndex = firstRuleStatus;
	fOtherRuleStatusIndex = otherRuleStatus;

	int32_t rangeStart = startPos;
	int32_t rangeEnd = endPos;

	uint16_t category;
	int32_t current;
	// status is local only; errors are not reported to the caller.
	UErrorCode status = U_ZERO_ERROR;
	int32_t foundBreakCount = 0;
	UText *text = &fBI->fText;

	// Loop through the text, looking for ranges of dictionary characters.
	// For each span, find the appropriate break engine, and ask it to find
	// any breaks within the span.

	utext_setNativeIndex(text, rangeStart);
	UChar32 c = utext_current32(text);
	category = ucptrie_get(fBI->fData->fTrie, c);
	// Categories below dictStart are skipped by the inner loop; a category at or above it
	// stops the scan and is handed to a language break engine.
	uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;

	// status is not modified inside this loop, so the loop ends only through the break below.
	while(U_SUCCESS(status)) {
	// Skip forward over non-dictionary characters, stopping at rangeEnd.
	while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
	&& (category < dictStart)) {
	utext_next32(text); // TODO: cleaner loop structure.
	c = utext_current32(text);
	category = ucptrie_get(fBI->fData->fTrie, c);
	}
	if (current >= rangeEnd) {
	break;
	}

	// We now have a dictionary character. Get the appropriate language object
	// to deal with it.
	const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);

	// Ask the language object if there are any breaks. It will add them to the cache and
	// leave the text pointer on the other side of its range, ready to search for the next one.
	// NOTE(review): if lbe were NULL nothing here advances the text position, so the loop
	// would not make progress; presumably getLanguageBreakEngine() never returns NULL - confirm.
	if (lbe != NULL) {
	foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks);
	}

	// Reload the loop variables for the next go-round
	c = utext_current32(text);
	category = ucptrie_get(fBI->fData->fTrie, c);
	}

	// If we found breaks, ensure that the first and last entries are
	// the original starting and ending position. And initialize the
	// cache iteration position to the first entry.

	// printf("foundBreakCount = %d\n", foundBreakCount);
	if (foundBreakCount > 0) {
	U_ASSERT(foundBreakCount == fBreaks.size());
	if (startPos < fBreaks.elementAti(0)) {
	// The dictionary did not place a boundary at the start of the segment of text.
	// Add one now. This should not commonly happen, but it would be easy for interactions
	// of the rules for dictionary segments and the break engine implementations to
	// inadvertently cause it. Cover it here, just in case.
	fBreaks.insertElementAt(startPos, 0, status);
	}
	if (endPos > fBreaks.peeki()) {
	// Likewise make sure the last entry reaches endPos.
	fBreaks.push(endPos, status);
	}
	fPositionInCache = 0;
	// Note: Dictionary matching may extend beyond the original limit.
	fStart = fBreaks.elementAti(0);
	fLimit = fBreaks.peeki();
	} else {
	// there were no language-based breaks, even though the segment contained
	// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
	// for this range will fail, and the calling code will fall back to the rule based boundaries.
	}
	}


	/*
	* BreakCache implementation
	*/

	/*
	 * Build a break cache attached to the break iterator `bi`. `status` is passed on to the
	 * fSideBuffer constructor; the remaining state is set up by reset().
	 */
	RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status)
	    : fBI(bi),
	      fSideBuffer(status)
	{
	    reset();
	}


	// The destructor body is intentionally empty; in particular fBI is not deleted here.
	RuleBasedBreakIterator::BreakCache::~BreakCache() {}


	/*
	 * Collapse the cache to a single boundary at text position `pos`, stored together
	 * with the rule status index `ruleStatus` (narrowed to uint16_t).
	 */
	void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
	    // The single entry lives in slot 0, which is at once the start, the end
	    // and the current iteration position of the circular buffer.
	    fBoundaries[0] = pos;
	    fStatuses[0] = static_cast<uint16_t>(ruleStatus);
	    fStartBufIdx = fEndBufIdx = fBufIdx = 0;
	    fTextIdx = pos;
	}


	/*
	 * Publish the cache's current position to the owning break iterator:
	 * sets its fPosition and fRuleStatusIndex and clears its fDone flag.
	 * Returns the current text position (fTextIdx).
	 */
	int32_t RuleBasedBreakIterator::BreakCache::current() {
	    RuleBasedBreakIterator &owner = *fBI;
	    owner.fDone = FALSE;
	    owner.fRuleStatusIndex = fStatuses[fBufIdx];
	    owner.fPosition = fTextIdx;
	    return fTextIdx;
	}


	/*
	 * Move to the boundary following startPos.
	 *
	 * The cache is first positioned at startPos (already there, via seek(), or via populateNear()),
	 * then a next() is performed from that position. Does nothing if status already indicates
	 * failure or if the cache could not be positioned.
	 */
	void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
	    if (U_FAILURE(status)) {
	        return;
	    }
	    if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
	        // startPos is in the cache. Do a next() from that position.
	        // TODO: an awkward set of interactions with bi->fDone
	        //       seek() does not clear it; it can't because of interactions with populateNear().
	        //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
	        //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
	        fBI->fDone = false;
	        next();
	    }
	    return;
	}


	// Move the iteration position to the boundary preceding startPos.
	//   startPos  text position to start from; it need not itself be a boundary.
	//   status    does nothing if this already indicates a failure on entry;
	//             also passed on to populateNear() and previous().
	// If the cache cannot be positioned for startPos, nothing further is done.
	void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
	    if (U_FAILURE(status)) {
	        return;
	    }
	    if (!(startPos == fTextIdx || seek(startPos) || populateNear(startPos, status))) {
	        return;
	    }
	    if (startPos == fTextIdx) {
	        // The cache sits exactly on startPos; step back one boundary.
	        previous(status);
	    } else {
	        // seek() leaves the BreakCache positioned at the preceding boundary
	        //    if the requested position is between two boundaries.
	        // current() pushes the BreakCache position out to the BreakIterator itself.
	        U_ASSERT(startPos > fTextIdx);
	        current();
	    }
	}


	/*
	* Out-of-line code for BreakCache::next().
	* Cache does not already contain the boundary
	*/
	void RuleBasedBreakIterator::BreakCache::nextOL() {
	fBI->fDone = !populateFollowing();
	fBI->fPosition = fTextIdx;
	fBI->fRuleStatusIndex = fStatuses[fBufIdx];
	return;
	}


	// Step the iteration position back by one boundary and publish the result
	// (position, rule status index, done flag) to the break iterator.
	//   status  does nothing if this already indicates a failure on entry;
	//           also passed on to populatePreceding().
	// The iterator is flagged as done when the buffer index did not change.
	void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
	    if (U_FAILURE(status)) {
	        return;
	    }
	    const int32_t idxOnEntry = fBufIdx;
	    if (fBufIdx != fStartBufIdx) {
	        // The preceding boundary is already in the cache; just step back to it.
	        fBufIdx = modChunkSize(fBufIdx - 1);
	        fTextIdx = fBoundaries[fBufIdx];
	    } else {
	        // Sitting on the first cached boundary. Try to prepend to the cache.
	        populatePreceding(status);
	    }
	    fBI->fDone = (fBufIdx == idxOnEntry);
	    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
	    fBI->fPosition = fTextIdx;
	}


	// Position the cache for text position pos, using only boundaries already cached.
	//   pos  the requested text position; it need not itself be a boundary.
	// Returns FALSE, leaving the cache position untouched, when pos lies outside the
	// range of cached boundaries. Otherwise returns TRUE with the cache positioned on
	// pos if it is a cached boundary, or on the closest cached boundary preceding it.
	// Only the cache's own position (fBufIdx, fTextIdx) is changed; nothing is
	// written to the break iterator.
	UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
	    if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
	        return FALSE;
	    }
	    if (pos == fBoundaries[fStartBufIdx]) {
	        // Common case: seek(0), from BreakIterator::first()
	        fBufIdx = fStartBufIdx;
	        fTextIdx = fBoundaries[fBufIdx];
	        return TRUE;
	    }
	    if (pos == fBoundaries[fEndBufIdx]) {
	        fBufIdx = fEndBufIdx;
	        fTextIdx = fBoundaries[fBufIdx];
	        return TRUE;
	    }

	    // Binary search of the circular buffer for the first boundary greater than pos.
	    // When the live range wraps around the end of the array (min > max), CACHE_SIZE
	    // is added before halving so that the midpoint falls inside the live range.
	    int32_t min = fStartBufIdx;
	    int32_t max = fEndBufIdx;
	    while (min != max) {
	        int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
	        probe = modChunkSize(probe);
	        if (fBoundaries[probe] > pos) {
	            max = probe;
	        } else {
	            min = modChunkSize(probe + 1);
	        }
	    }
	    U_ASSERT(fBoundaries[max] > pos);

	    // The entry just before that one is the boundary at or preceding pos.
	    fBufIdx = modChunkSize(max - 1);
	    fTextIdx = fBoundaries[fBufIdx];
	    U_ASSERT(fTextIdx <= pos);
	    return TRUE;
	}


	// Add boundaries to the cache in the vicinity of a position that is outside of
	// the currently cached range, and leave the cache positioned at the boundary
	// at or preceding that position.
	//   position  the requested text position; it need not itself be a boundary.
	//   status    returns FALSE immediately if this already indicates a failure;
	//             also passed on to populatePreceding() and previous().
	// Returns TRUE on success, FALSE if the cache could not be extended back far
	// enough to reach the requested position.
	UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
	    if (U_FAILURE(status)) {
	        return FALSE;
	    }
	    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

	    // Find a boundary somewhere in the vicinity of the requested position.
	    // Depending on the safe rules and the text data, it could be either before, at, or after
	    // the requested position.


	    // If the requested position is not near already cached positions, clear the existing cache,
	    // find a near-by boundary and begin new cache contents there.

	    if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
	        int32_t aBoundary = 0;
	        int32_t ruleStatusIndex = 0;
	        if (position > 20) {
	            int32_t backupPos = fBI->handleSafePrevious(position);

	            if (backupPos > 0) {
	                // Advance to the boundary following the backup position.
	                // There is a complication: the safe reverse rules identify pairs of code points
	                // that are safe. If advancing from the safe point moves forwards by less than
	                // two code points, we need to advance one more time to ensure that the boundary
	                // is good, including a correct rules status value.
	                //
	                fBI->fPosition = backupPos;
	                aBoundary = fBI->handleNext();
	                if (aBoundary <= backupPos + 4) {
	                    // +4 is a quick test for possibly having advanced only one codepoint.
	                    // Four being the length of the longest potential code point, a supplementary in UTF-8
	                    utext_setNativeIndex(&fBI->fText, aBoundary);
	                    if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
	                        // The initial handleNext() only advanced by a single code point. Go again.
	                        aBoundary = fBI->handleNext();      // Safe rules identify safe pairs.
	                    }
	                }
	                ruleStatusIndex = fBI->fRuleStatusIndex;
	            }
	        }
	        reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
	    }

	    // Fill in boundaries between existing cache content and the new requested position.

	    if (fBoundaries[fEndBufIdx] < position) {
	        // The last position in the cache precedes the requested position.
	        // Add following position(s) to the cache.
	        while (fBoundaries[fEndBufIdx] < position) {
	            if (!populateFollowing()) {
	                UPRV_UNREACHABLE;
	            }
	        }
	        fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
	        fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
	        while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
	            previous(status);
	        }
	        return true;
	    }

	    if (fBoundaries[fStartBufIdx] > position) {
	        // The first position in the cache is beyond the requested position.
	        // back up more until we get a boundary <= the requested position.
	        while (fBoundaries[fStartBufIdx] > position) {
	            if (!populatePreceding(status)) {
	                // Nothing could be prepended, for example because status now holds a failure.
	                // The start of the cache will not move any further, so give up
	                // rather than repeat the same call forever.
	                return FALSE;
	            }
	        }
	        fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
	        fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
	        while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
	            next();
	        }
	        if (fTextIdx > position) {
	            // If position is not itself a boundary, the next() loop above will overshoot.
	            // Back up one, leaving cache position at the boundary preceding the requested position.
	            previous(status);
	        }
	        return true;
	    }

	    U_ASSERT(fTextIdx == position);
	    return true;
	}



	// Append to the cache the boundary following the last boundary it currently holds,
	// and move the cache position to that new boundary.
	// The boundary is taken from the dictionary cache when it can supply one,
	// otherwise from the break rules (handleNext()). A few additional rule based
	// boundaries may be appended as well, without moving the cache position onto them.
	// Returns FALSE, with the cache unchanged, if handleNext() reports UBRK_DONE;
	// TRUE otherwise.
	UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
	    int32_t fromPosition = fBoundaries[fEndBufIdx];
	    int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
	    int32_t pos = 0;
	    int32_t ruleStatusIdx = 0;

	    // First choice: a boundary already sitting in the dictionary cache.
	    if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
	        addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
	        return TRUE;
	    }

	    fBI->fPosition = fromPosition;
	    pos = fBI->handleNext();
	    if (pos == UBRK_DONE) {
	        return FALSE;
	    }

	    ruleStatusIdx = fBI->fRuleStatusIndex;
	    if (fBI->fDictionaryCharCount > 0) {
	        // The text segment obtained from the rules includes dictionary characters.
	        // Subdivide it, with subdivided results going into the dictionary cache.
	        fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
	        if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
	            addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
	            return TRUE;
	            // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
	            //       But be careful with interactions with populateNear().
	        }
	    }

	    // Rule based segment did not include dictionary characters.
	    // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
	    //    meaning that we didn't take the return, above.
	    // Add its end point to the cache.
	    addFollowing(pos, ruleStatusIdx, UpdateCachePosition);

	    // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
	    //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.)
	    //
	    for (int count=0; count<6; ++count) {
	        pos = fBI->handleNext();
	        if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
	            break;
	        }
	        addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
	    }

	    return TRUE;
	}


	// Prepend to the cache the boundary preceding the first boundary it currently
	// holds, and move the cache position to that new boundary. Additional, earlier
	// boundaries found along the way are prepended too, as far as space allows,
	// without moving the cache position onto them.
	//   status  returns FALSE immediately if this already indicates a failure;
	//           also receives errors from additions to the side buffer.
	// Returns TRUE if a boundary was prepended. Returns FALSE if the first cached
	// boundary is already at text position 0, or if nothing could be added.
	UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
	    if (U_FAILURE(status)) {
	        return FALSE;
	    }

	    int32_t fromPosition = fBoundaries[fStartBufIdx];
	    if (fromPosition == 0) {
	        return FALSE;
	    }

	    int32_t position = 0;
	    int32_t positionStatusIdx = 0;

	    // First choice: a boundary already sitting in the dictionary cache.
	    if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
	        addPreceding(position, positionStatusIdx, UpdateCachePosition);
	        return TRUE;
	    }

	    int32_t backupPosition = fromPosition;

	    // Find a boundary somewhere preceding the first already-cached boundary
	    do {
	        backupPosition = backupPosition - 30;
	        if (backupPosition <= 0) {
	            backupPosition = 0;
	        } else {
	            backupPosition = fBI->handleSafePrevious(backupPosition);
	        }
	        if (backupPosition == UBRK_DONE || backupPosition == 0) {
	            position = 0;
	            positionStatusIdx = 0;
	        } else {
	            // Advance to the boundary following the backup position.
	            // There is a complication: the safe reverse rules identify pairs of code points
	            // that are safe. If advancing from the safe point moves forwards by less than
	            // two code points, we need to advance one more time to ensure that the boundary
	            // is good, including a correct rules status value.
	            //
	            fBI->fPosition = backupPosition;
	            position = fBI->handleNext();
	            if (position <= backupPosition + 4) {
	                // +4 is a quick test for possibly having advanced only one codepoint.
	                // Four being the length of the longest potential code point, a supplementary in UTF-8
	                utext_setNativeIndex(&fBI->fText, position);
	                if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
	                    // The initial handleNext() only advanced by a single code point. Go again.
	                    position = fBI->handleNext();   // Safe rules identify safe pairs.
	                }
	            }
	            positionStatusIdx = fBI->fRuleStatusIndex;
	        }
	    } while (position >= fromPosition);

	    // Find boundaries between the one we just located and the first already-cached boundary
	    // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
	    // Each boundary occupies two consecutive side buffer entries: its position, then its status index.

	    fSideBuffer.removeAllElements();
	    fSideBuffer.addElement(position, status);
	    fSideBuffer.addElement(positionStatusIdx, status);

	    do {
	        int32_t prevPosition = fBI->fPosition = position;
	        int32_t prevStatusIdx = positionStatusIdx;
	        position = fBI->handleNext();
	        positionStatusIdx = fBI->fRuleStatusIndex;
	        if (position == UBRK_DONE) {
	            break;
	        }

	        UBool segmentHandledByDictionary = FALSE;
	        if (fBI->fDictionaryCharCount != 0) {
	            // Segment from the rules includes dictionary characters.
	            // Subdivide it, with subdivided results going into the dictionary cache.
	            int32_t dictSegEndPosition = position;
	            fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
	            while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
	                segmentHandledByDictionary = true;
	                U_ASSERT(position > prevPosition);
	                if (position >= fromPosition) {
	                    break;
	                }
	                U_ASSERT(position <= dictSegEndPosition);
	                fSideBuffer.addElement(position, status);
	                fSideBuffer.addElement(positionStatusIdx, status);
	                prevPosition = position;
	            }
	            U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
	        }

	        if (!segmentHandledByDictionary && position < fromPosition) {
	            fSideBuffer.addElement(position, status);
	            fSideBuffer.addElement(positionStatusIdx, status);
	        }
	    } while (position < fromPosition);

	    // Move boundaries from the side buffer to the main circular buffer.
	    // Entries are popped in reverse order of insertion, so the status index comes
	    // off before its position, and the boundary closest to fromPosition comes first.
	    UBool success = FALSE;
	    if (!fSideBuffer.isEmpty()) {
	        positionStatusIdx = fSideBuffer.popi();
	        position = fSideBuffer.popi();
	        addPreceding(position, positionStatusIdx, UpdateCachePosition);
	        success = TRUE;
	    }

	    while (!fSideBuffer.isEmpty()) {
	        positionStatusIdx = fSideBuffer.popi();
	        position = fSideBuffer.popi();
	        if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
	            // No space in circular buffer to hold a new preceding result while
	            // also retaining the current cache (iteration) position.
	            // Bailing out is safe; the cache will refill again if needed.
	            break;
	        }
	    }

	    return success;
	}


	void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
	U_ASSERT(position > fBoundaries[fEndBufIdx]);
	U_ASSERT(ruleStatusIdx <= UINT16_MAX);
	int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
	if (nextIdx == fStartBufIdx) {
	fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
	}
	fBoundaries[nextIdx] = position;
	fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
	fEndBufIdx = nextIdx;
	if (update == UpdateCachePosition) {
	// Set current position to the newly added boundary.
	fBufIdx = nextIdx;
	fTextIdx = position;
	} else {
	// Retaining the original cache position.
	// Check if the added boundary wraps around the buffer, and would over-write the original position.
	// It's the responsibility of callers of this function to not add too many.
	U_ASSERT(nextIdx != fBufIdx);
	}
	}

	bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
	U_ASSERT(position < fBoundaries[fStartBufIdx]);
	U_ASSERT(ruleStatusIdx <= UINT16_MAX);
	int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
	if (nextIdx == fEndBufIdx) {
	if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
	// Failure. The insertion of the new boundary would claim the buffer position that is the
	// current iteration position. And we also want to retain the current iteration position.
	// (The buffer is already completely full of entries that precede the iteration position.)
	return false;
	}
	fEndBufIdx = modChunkSize(fEndBufIdx - 1);
	}
	fBoundaries[nextIdx] = position;
	fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
	fStartBufIdx = nextIdx;
	if (update == UpdateCachePosition) {
	fBufIdx = nextIdx;
	fTextIdx = position;
	}
	return true;
	}


	// Debugging aid: print the cache position, then every buffer slot from the first
	// entry through the last entry together with the boundary stored in it.
	// Does nothing unless RBBI_DEBUG is defined.
	void RuleBasedBreakIterator::BreakCache::dumpCache() {
	#ifdef RBBI_DEBUG
	    RBBIDebugPrintf("fTextIdx:%d   fBufIdx:%d\n", fTextIdx, fBufIdx);
	    int32_t slot = fStartBufIdx;
	    for (;;) {
	        RBBIDebugPrintf("%d  %d\n", slot, fBoundaries[slot]);
	        if (slot == fEndBufIdx) {
	            break;
	        }
	        slot = modChunkSize(slot + 1);
	    }
	#endif
	}

	U_NAMESPACE_END

	#endif // #if !UCONFIG_NO_BREAK_ITERATION