source/common/dbbi.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 1999-2001 IBM Corp. All rights reserved.
 **********************************************************************
 *   Date        Name        Description
 *   12/1/99    rgillam     Complete port from Java.
 *   01/13/2000 helena      Added UErrorCode to ctors.
 **********************************************************************
 */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_BREAK_ITERATION

 #include "unicode/dbbi.h"
 #include "unicode/schriter.h"
 #include "dbbi_tbl.h"
 #include "uvector.h"
 #include "cmemory.h"

 U_NAMESPACE_BEGIN

 const char DictionaryBasedBreakIterator::fgClassID = 0;


 //-------------------------------------------------------------------------------
 //
 // constructors
 //
 //-------------------------------------------------------------------------------

 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
 RuleBasedBreakIterator() {
     init();
 }


 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
                                                            const char* dictionaryFilename,
                                                            UErrorCode& status)
 : RuleBasedBreakIterator(rbbiData, status)
 {
     init();
     if (U_FAILURE(status)) {return;};
     fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
     /* test for NULL */
     if(fTables == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }

     if (U_FAILURE(status)) {
         fTables->removeReference();
         fTables = NULL;
         return;
     }
 }


 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other) :
 RuleBasedBreakIterator(other)
 {
     init();
     if (other.fTables != NULL) {
         fTables = other.fTables;
         fTables->addReference();
     }
 }


 //-------------------------------------------------------------------------------
 //
 //   Destructor
 //
 //-------------------------------------------------------------------------------
 DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
 {
     uprv_free(cachedBreakPositions);
     cachedBreakPositions = NULL;
     if (fTables != NULL) {fTables->removeReference();};
 }

 //-------------------------------------------------------------------------------
 //
 //   Assignment operator.     Sets this iterator to have the same behavior,
 //                            and iterate over the same text, as the one passed in.
 //
 //-------------------------------------------------------------------------------
 DictionaryBasedBreakIterator&
 DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
     if (this == &that) {
         return *this;
     }
     reset();      // clears out cached break positions.
     RuleBasedBreakIterator::operator=(that);
     if (this->fTables != that.fTables) {
         if (this->fTables != NULL) {this->fTables->removeReference();};
         this->fTables = that.fTables;
         if (this->fTables != NULL) {this->fTables->addReference();};
     }
     return *this;
 }

 //-------------------------------------------------------------------------------
 //
 //   Clone()    Returns a newly-constructed RuleBasedBreakIterator with the same
 //              behavior, and iterating over the same text, as this one.
 //
 //-------------------------------------------------------------------------------
 BreakIterator*
 DictionaryBasedBreakIterator::clone() const {
     return new DictionaryBasedBreakIterator(*this);
 }

 //=======================================================================
 // BreakIterator overrides
 //=======================================================================

 /**
  * Advances the iterator one step backwards.
  * @return The position of the last boundary position before the
  * current iteration position
  */
 int32_t
 DictionaryBasedBreakIterator::previous()
 {
     // if we have cached break positions and we're still in the range
     // covered by them, just move one step backward in the cache
     if (cachedBreakPositions != NULL && positionInCache > 0) {
         --positionInCache;
         fText->setIndex(cachedBreakPositions[positionInCache]);
         return cachedBreakPositions[positionInCache];
     }

     // otherwise, dump the cache and use the inherited previous() method to move
     // backward.  This may fill up the cache with new break positions, in which
     // case we have to mark our position in the cache
     else {
         reset();
         int32_t result = RuleBasedBreakIterator::previous();
         if (cachedBreakPositions != NULL) {
             positionInCache = numCachedBreakPositions - 2;
         }
         return result;
     }
 }

 /**
  * Sets the current iteration position to the last boundary position
  * before the specified position.
  * @param offset The position to begin searching from
  * @return The position of the last boundary before "offset"
  */
 int32_t
 DictionaryBasedBreakIterator::preceding(int32_t offset)
 {
     // if the offset passed in is already past the end of the text,
     // just return DONE; if it's before the beginning, return the
     // text's starting offset
     if (fText == NULL || offset > fText->endIndex()) {
         return BreakIterator::DONE;
     }
     else if (offset < fText->startIndex()) {
         return fText->startIndex();
     }

     // if we have no cached break positions, or "offset" is outside the
     // range covered by the cache, we can just call the inherited routine
     // (which will eventually call other routines in this class that may
     // refresh the cache)
     if (cachedBreakPositions == NULL || offset <= cachedBreakPositions[0] ||
             offset > cachedBreakPositions[numCachedBreakPositions - 1]) {
         reset();
         return RuleBasedBreakIterator::preceding(offset);
     }

     // on the other hand, if "offset" is within the range covered by the cache,
     // then all we have to do is search the cache for the last break position
     // before "offset"
     else {
         positionInCache = 0;
         while (positionInCache < numCachedBreakPositions
                && offset > cachedBreakPositions[positionInCache])
             ++positionInCache;
         --positionInCache;
         fText->setIndex(cachedBreakPositions[positionInCache]);
         return fText->getIndex();
     }
 }

 /**
  * Sets the current iteration position to the first boundary position after
  * the specified position.
  * @param offset The position to begin searching forward from
  * @return The position of the first boundary after "offset"
  */
 int32_t
 DictionaryBasedBreakIterator::following(int32_t offset)
 {
     // if the offset passed in is already past the end of the text,
     // just return DONE; if it's before the beginning, return the
     // text's starting offset
     if (fText == NULL || offset > fText->endIndex()) {
         return BreakIterator::DONE;
     }
     else if (offset < fText->startIndex()) {
         return fText->startIndex();
     }

     // if we have no cached break positions, or if "offset" is outside the
     // range covered by the cache, then dump the cache and call our
     // inherited following() method.  This will call other methods in this
     // class that may refresh the cache.
     if (cachedBreakPositions == NULL || offset < cachedBreakPositions[0] ||
             offset >= cachedBreakPositions[numCachedBreakPositions - 1]) {
         reset();
         return RuleBasedBreakIterator::following(offset);
     }

     // on the other hand, if "offset" is within the range covered by the
     // cache, then just search the cache for the first break position
     // after "offset"
     else {
         positionInCache = 0;
         while (positionInCache < numCachedBreakPositions
                && offset >= cachedBreakPositions[positionInCache])
             ++positionInCache;
         fText->setIndex(cachedBreakPositions[positionInCache]);
         return fText->getIndex();
     }
 }

 /**
  * This is the implementation function for next().
  */
 int32_t
 DictionaryBasedBreakIterator::handleNext()
 {
     UErrorCode status = U_ZERO_ERROR;
     // if there are no cached break positions, or if we've just moved
     // off the end of the range covered by the cache, we have to dump
     // and possibly regenerate the cache
     if (cachedBreakPositions == NULL || positionInCache == numCachedBreakPositions - 1) {

         // start by using the inherited handleNext() to find a tentative return
         // value.   dictionaryCharCount tells us how many dictionary characters
         // we passed over on our way to the tentative return value
         int32_t startPos = fText->getIndex();
         fDictionaryCharCount = 0;
         int32_t result = RuleBasedBreakIterator::handleNext();

         // if we passed over more than one dictionary character, then we use
         // divideUpDictionaryRange() to regenerate the cached break positions
         // for the new range
         if (fDictionaryCharCount > 1 && result - startPos > 1) {
             divideUpDictionaryRange(startPos, result, status);
             if (U_FAILURE(status)) {
                 return -9999;   // SHOULD NEVER GET HERE!
             }
         }

         // otherwise, the value we got back from the inherited fuction
         // is our return value, and we can dump the cache
         else {
             reset();
             return result;
         }
     }

     // if the cache of break positions has been regenerated (or existed all
     // along), then just advance to the next break position in the cache
     // and return it
     if (cachedBreakPositions != NULL) {
         ++positionInCache;
         fText->setIndex(cachedBreakPositions[positionInCache]);
         return cachedBreakPositions[positionInCache];
     }
     return -9999;   // SHOULD NEVER GET HERE!
 }

 void
 DictionaryBasedBreakIterator::reset()
 {
     uprv_free(cachedBreakPositions);
     cachedBreakPositions = NULL;
     numCachedBreakPositions = 0;
     fDictionaryCharCount = 0;
     positionInCache = 0;
 }


 //-------------------------------------------------------------------------------
 //
 //    init()    Common initialization routine, for use by constructors, etc.
 //
 //-------------------------------------------------------------------------------
 void DictionaryBasedBreakIterator::init() {
     cachedBreakPositions    = NULL;
     fTables                 = NULL;
     numCachedBreakPositions = 0;
     fDictionaryCharCount    = 0;
     positionInCache         = 0;
 }


 //-------------------------------------------------------------------------------
 //
 //    BufferClone
 //
 //-------------------------------------------------------------------------------
 BreakIterator *  DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
                                    int32_t &bufferSize,
                                    UErrorCode &status)
 {
     if (U_FAILURE(status)){
         return NULL;
     }

     //
     //  If user buffer size is zero this is a preflight operation to
     //    obtain the needed buffer size, allowing for worst case misalignment.
     //
     if (bufferSize == 0) {
         bufferSize = sizeof(DictionaryBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
         return NULL;
     }

     //
     //  Check the alignment and size of the user supplied buffer.
     //  Allocate heap memory if the user supplied memory is insufficient.
     //
     char     *buf   = (char *)stackBuffer;
     uint32_t s      = bufferSize;

     if (stackBuffer == NULL) {
         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
     }
     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(buf);
         s   -= offsetUp;
         buf += offsetUp;
     }
     if (s < sizeof(DictionaryBasedBreakIterator)) {
         buf = (char *) new DictionaryBasedBreakIterator();
         if (buf == 0) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return NULL;
         }
         status = U_SAFECLONE_ALLOCATED_WARNING;
     }

     //
     //  Initialize the clone object.
     //    TODO:  using an overloaded C++ "operator new" to directly initialize the
     //           copy in the user's buffer would be better, but it doesn't seem
     //           to get along with namespaces.  Investigate why.
     //
     //           The memcpy is only safe with an empty (default constructed)
     //           break iterator.  Use on others can screw up reference counts
     //           to data.  memcpy-ing objects is not really a good idea...
     //
     DictionaryBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
     DictionaryBasedBreakIterator *clone = (DictionaryBasedBreakIterator *)buf;
     uprv_memcpy(clone, &localIter, sizeof(DictionaryBasedBreakIterator)); // clone = empty, but initialized, iterator.
     *clone = *this;                               // clone = the real one we want.
     if (status != U_SAFECLONE_ALLOCATED_WARNING) {
         clone->fBufferClone = TRUE;
     }
     return clone;
 }


 /**
  * This is the function that actually implements the dictionary-based
  * algorithm.  Given the endpoints of a range of text, it uses the
  * dictionary to determine the positions of any boundaries in this
  * range.  It stores all the boundary positions it discovers in
  * cachedBreakPositions so that we only have to do this work once
  * for each time we enter the range.
  */
 void
 DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status)
 {
     // the range we're dividing may begin or end with non-dictionary characters
     // (i.e., for line breaking, we may have leading or trailing punctuation
     // that needs to be kept with the word).  Seek from the beginning of the
     // range to the first dictionary character
     fText->setIndex(startPos);
     UChar c = fText->current();
     while (isDictionaryChar(c) == FALSE) {
         c = fText->next();
     }


     // initialize.  We maintain two stacks: currentBreakPositions contains
     // the list of break positions that will be returned if we successfully
     // finish traversing the whole range now.  possibleBreakPositions lists
     // all other possible word ends we've passed along the way.  (Whenever
     // we reach an error [a sequence of characters that can't begin any word
     // in the dictionary], we back up, possibly delete some breaks from
     // currentBreakPositions, move a break from possibleBreakPositions
     // to currentBreakPositions, and start over from there.  This process
     // continues in this way until we either successfully make it all the way
     // across the range, or exhaust all of our combinations of break
     // positions.) wrongBreakPositions is used to keep track of paths we've
     // tried on previous iterations.  As the iterator backs up further and
     // further, this saves us from having to follow each possible path
     // through the text all the way to the error (hopefully avoiding many
     // future recursive calls as well).
     UStack currentBreakPositions(status);
     UStack possibleBreakPositions(status);
     UVector wrongBreakPositions(status);

     // the dictionary is implemented as a trie, which is treated as a state
     // machine.  -1 represents the end of a legal word.  Every word in the
     // dictionary is represented by a path from the root node to -1.  A path
     // that ends in state 0 is an illegal combination of characters.
     int16_t state = 0;

     // these two variables are used for error handling.  We keep track of the
     // farthest we've gotten through the range being divided, and the combination
     // of breaks that got us that far.  If we use up all possible break
     // combinations, the text contains an error or a word that's not in the
     // dictionary.  In this case, we "bless" the break positions that got us the
     // farthest as real break positions, and then start over from scratch with
     // the character where the error occurred.
     int32_t farthestEndPoint = fText->getIndex();
     UStack bestBreakPositions(status);
     UBool bestBreakPositionsInitialized = FALSE;

     if (U_FAILURE(status)) {
         return;
     }
     // initialize (we always exit the loop with a break statement)
     c = fText->current();
     for (;;) {

         // if we can transition to state "-1" from our current state, we're
         // on the last character of a legal word.  Push that position onto
         // the possible-break-positions stack
         if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
             possibleBreakPositions.push(fText->getIndex(), status);
         }

         // look up the new state to transition to in the dictionary
         state = fTables->fDictionary->at(state, c);

         // if the character we're sitting on causes us to transition to
         // the "end of word" state, then it was a non-dictionary character
         // and we've successfully traversed the whole range.  Drop out
         // of the loop.
         if (state == -1) {
             currentBreakPositions.push(fText->getIndex(), status);
             break;
         }

         // if the character we're sitting on causes us to transition to
         // the error state, or if we've gone off the end of the range
         // without transitioning to the "end of word" state, we've hit
         // an error...
         else if (state == 0 || fText->getIndex() >= endPos) {

             // if this is the farthest we've gotten, take note of it in
             // case there's an error in the text
             if (fText->getIndex() > farthestEndPoint) {
                 farthestEndPoint = fText->getIndex();
                 bestBreakPositions.removeAllElements();
                 bestBreakPositionsInitialized = TRUE;
                 for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
                     bestBreakPositions.push(currentBreakPositions.elementAti(i), status);
                 }
             }

             // wrongBreakPositions is a list of all break positions we've tried starting
             // that didn't allow us to traverse all the way through the text.  Every time
             // we pop a break position off of currentBreakPositions, we put it into
             // wrongBreakPositions to avoid trying it again later.  If we make it to this
             // spot, we're either going to back up to a break in possibleBreakPositions
             // and try starting over from there, or we've exhausted all possible break
             // positions and are going to do the fallback procedure.  This loop prevents
             // us from messing with anything in possibleBreakPositions that didn't work as
             // a starting point the last time we tried it (this is to prevent a bunch of
             // repetitive checks from slowing down some extreme cases)
             while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
                         possibleBreakPositions.peeki())) {
                 possibleBreakPositions.popi();
             }

             // if we've used up all possible break-position combinations, there's
             // an error or an unknown word in the text.  In this case, we start
             // over, treating the farthest character we've reached as the beginning
             // of the range, and "blessing" the break positions that got us that
             // far as real break positions
             if (possibleBreakPositions.isEmpty()) {
                 if (bestBreakPositionsInitialized) {
                     currentBreakPositions.removeAllElements();
                     for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
                         currentBreakPositions.push(bestBreakPositions.elementAti(i), status);
                     }
                     bestBreakPositions.removeAllElements();
                     if (farthestEndPoint < endPos) {
                         fText->setIndex(farthestEndPoint + 1);
                     }
                     else {
                         break;
                     }
                 }
                 else {
                     if ((currentBreakPositions.isEmpty()
                             || currentBreakPositions.peeki() != fText->getIndex())
                             && fText->getIndex() != startPos) {
                         currentBreakPositions.push(fText->getIndex(), status);
                     }
                     fText->next();
                     currentBreakPositions.push(fText->getIndex(), status);
                 }
             }

             // if we still have more break positions we can try, then promote the
             // last break in possibleBreakPositions into currentBreakPositions,
             // and get rid of all entries in currentBreakPositions that come after
             // it.  Then back up to that position and start over from there (i.e.,
             // treat that position as the beginning of a new word)
             else {
                 int32_t temp = possibleBreakPositions.popi();
                 int32_t temp2 = 0;
                 while (!currentBreakPositions.isEmpty() && temp <
                        currentBreakPositions.peeki()) {
                     temp2 = currentBreakPositions.popi();
                     wrongBreakPositions.addElement(temp2, status);
                 }
                 currentBreakPositions.push(temp, status);
                 fText->setIndex(currentBreakPositions.peeki());
             }

             // re-sync "c" for the next go-round, and drop out of the loop if
             // we've made it off the end of the range
             c = fText->current();
             if (fText->getIndex() >= endPos) {
                 break;
             }
         }

         // if we didn't hit any exceptional conditions on this last iteration,
         // just advance to the next character and loop
         else {
             c = fText->next();
         }
     }

     // dump the last break position in the list, and replace it with the actual
     // end of the range (which may be the same character, or may be further on
     // because the range actually ended with non-dictionary characters we want to
     // keep with the word)
     if (!currentBreakPositions.isEmpty()) {
         currentBreakPositions.popi();
     }
     currentBreakPositions.push(endPos, status);

     // create a regular array to hold the break positions and copy
     // the break positions from the stack to the array (in addition,
     // our starting position goes into this array as a break position).
     // This array becomes the cache of break positions used by next()
     // and previous(), so this is where we actually refresh the cache.
     if (cachedBreakPositions != NULL) {
         uprv_free(cachedBreakPositions);
     }
     cachedBreakPositions = (int32_t *)uprv_malloc((currentBreakPositions.size() + 1) * sizeof(int32_t));
     /* Test for NULL */
     if(cachedBreakPositions == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
     numCachedBreakPositions = currentBreakPositions.size() + 1;
     cachedBreakPositions[0] = startPos;

     for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
         cachedBreakPositions[i + 1] = currentBreakPositions.elementAti(i);
     }
     positionInCache = 0;
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

 /* eof */
	/*
	**********************************************************************
	* Copyright (C) 1999-2001 IBM Corp. All rights reserved.
	**********************************************************************
	* Date Name Description
	* 12/1/99 rgillam Complete port from Java.
	* 01/13/2000 helena Added UErrorCode to ctors.
	**********************************************************************
	*/

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_BREAK_ITERATION

	#include "unicode/dbbi.h"
	#include "unicode/schriter.h"
	#include "dbbi_tbl.h"
	#include "uvector.h"
	#include "cmemory.h"

	U_NAMESPACE_BEGIN

	const char DictionaryBasedBreakIterator::fgClassID = 0;


	//-------------------------------------------------------------------------------
	//
	// constructors
	//
	//-------------------------------------------------------------------------------

	DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
	RuleBasedBreakIterator() {
	init();
	}


	DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
	const char* dictionaryFilename,
	UErrorCode& status)
	: RuleBasedBreakIterator(rbbiData, status)
	{
	init();
	if (U_FAILURE(status)) {return;};
	fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
	/* test for NULL */
	if(fTables == 0) {
	status = U_MEMORY_ALLOCATION_ERROR;
	return;
	}

	if (U_FAILURE(status)) {
	fTables->removeReference();
	fTables = NULL;
	return;
	}
	}


	DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other) :
	RuleBasedBreakIterator(other)
	{
	init();
	if (other.fTables != NULL) {
	fTables = other.fTables;
	fTables->addReference();
	}
	}




	//-------------------------------------------------------------------------------
	//
	// Destructor
	//
	//-------------------------------------------------------------------------------
	DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
	{
	uprv_free(cachedBreakPositions);
	cachedBreakPositions = NULL;
	if (fTables != NULL) {fTables->removeReference();};
	}

	//-------------------------------------------------------------------------------
	//
	// Assignment operator. Sets this iterator to have the same behavior,
	// and iterate over the same text, as the one passed in.
	//
	//-------------------------------------------------------------------------------
	DictionaryBasedBreakIterator&
	DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
	if (this == &that) {
	return *this;
	}
	reset(); // clears out cached break positions.
	RuleBasedBreakIterator::operator=(that);
	if (this->fTables != that.fTables) {
	if (this->fTables != NULL) {this->fTables->removeReference();};
	this->fTables = that.fTables;
	if (this->fTables != NULL) {this->fTables->addReference();};
	}
	return *this;
	}

	//-------------------------------------------------------------------------------
	//
	// Clone() Returns a newly-constructed RuleBasedBreakIterator with the same
	// behavior, and iterating over the same text, as this one.
	//
	//-------------------------------------------------------------------------------
	BreakIterator*
	DictionaryBasedBreakIterator::clone() const {
	return new DictionaryBasedBreakIterator(*this);
	}

	//=======================================================================
	// BreakIterator overrides
	//=======================================================================

	/**
	* Advances the iterator one step backwards.
	* @return The position of the last boundary position before the
	* current iteration position
	*/
	int32_t
	DictionaryBasedBreakIterator::previous()
	{
	// if we have cached break positions and we're still in the range
	// covered by them, just move one step backward in the cache
	if (cachedBreakPositions != NULL && positionInCache > 0) {
	--positionInCache;
	fText->setIndex(cachedBreakPositions[positionInCache]);
	return cachedBreakPositions[positionInCache];
	}

	// otherwise, dump the cache and use the inherited previous() method to move
	// backward. This may fill up the cache with new break positions, in which
	// case we have to mark our position in the cache
	else {
	reset();
	int32_t result = RuleBasedBreakIterator::previous();
	if (cachedBreakPositions != NULL) {
	positionInCache = numCachedBreakPositions - 2;
	}
	return result;
	}
	}

	/**
	* Sets the current iteration position to the last boundary position
	* before the specified position.
	* @param offset The position to begin searching from
	* @return The position of the last boundary before "offset"
	*/
	int32_t
	DictionaryBasedBreakIterator::preceding(int32_t offset)
	{
	// if the offset passed in is already past the end of the text,
	// just return DONE; if it's before the beginning, return the
	// text's starting offset
	if (fText == NULL \|\| offset > fText->endIndex()) {
	return BreakIterator::DONE;
	}
	else if (offset < fText->startIndex()) {
	return fText->startIndex();
	}

	// if we have no cached break positions, or "offset" is outside the
	// range covered by the cache, we can just call the inherited routine
	// (which will eventually call other routines in this class that may
	// refresh the cache)
	if (cachedBreakPositions == NULL \|\| offset <= cachedBreakPositions[0] \|\|
	offset > cachedBreakPositions[numCachedBreakPositions - 1]) {
	reset();
	return RuleBasedBreakIterator::preceding(offset);
	}

	// on the other hand, if "offset" is within the range covered by the cache,
	// then all we have to do is search the cache for the last break position
	// before "offset"
	else {
	positionInCache = 0;
	while (positionInCache < numCachedBreakPositions
	&& offset > cachedBreakPositions[positionInCache])
	++positionInCache;
	--positionInCache;
	fText->setIndex(cachedBreakPositions[positionInCache]);
	return fText->getIndex();
	}
	}

	/**
	* Sets the current iteration position to the first boundary position after
	* the specified position.
	* @param offset The position to begin searching forward from
	* @return The position of the first boundary after "offset"
	*/
	int32_t
	DictionaryBasedBreakIterator::following(int32_t offset)
	{
	// if the offset passed in is already past the end of the text,
	// just return DONE; if it's before the beginning, return the
	// text's starting offset
	if (fText == NULL \|\| offset > fText->endIndex()) {
	return BreakIterator::DONE;
	}
	else if (offset < fText->startIndex()) {
	return fText->startIndex();
	}

	// if we have no cached break positions, or if "offset" is outside the
	// range covered by the cache, then dump the cache and call our
	// inherited following() method. This will call other methods in this
	// class that may refresh the cache.
	if (cachedBreakPositions == NULL \|\| offset < cachedBreakPositions[0] \|\|
	offset >= cachedBreakPositions[numCachedBreakPositions - 1]) {
	reset();
	return RuleBasedBreakIterator::following(offset);
	}

	// on the other hand, if "offset" is within the range covered by the
	// cache, then just search the cache for the first break position
	// after "offset"
	else {
	positionInCache = 0;
	while (positionInCache < numCachedBreakPositions
	&& offset >= cachedBreakPositions[positionInCache])
	++positionInCache;
	fText->setIndex(cachedBreakPositions[positionInCache]);
	return fText->getIndex();
	}
	}

	/**
	* This is the implementation function for next().
	*/
	int32_t
	DictionaryBasedBreakIterator::handleNext()
	{
	UErrorCode status = U_ZERO_ERROR;
	// if there are no cached break positions, or if we've just moved
	// off the end of the range covered by the cache, we have to dump
	// and possibly regenerate the cache
	if (cachedBreakPositions == NULL \|\| positionInCache == numCachedBreakPositions - 1) {

	// start by using the inherited handleNext() to find a tentative return
	// value. dictionaryCharCount tells us how many dictionary characters
	// we passed over on our way to the tentative return value
	int32_t startPos = fText->getIndex();
	fDictionaryCharCount = 0;
	int32_t result = RuleBasedBreakIterator::handleNext();

	// if we passed over more than one dictionary character, then we use
	// divideUpDictionaryRange() to regenerate the cached break positions
	// for the new range
	if (fDictionaryCharCount > 1 && result - startPos > 1) {
	divideUpDictionaryRange(startPos, result, status);
	if (U_FAILURE(status)) {
	return -9999; // SHOULD NEVER GET HERE!
	}
	}

	// otherwise, the value we got back from the inherited fuction
	// is our return value, and we can dump the cache
	else {
	reset();
	return result;
	}
	}

	// if the cache of break positions has been regenerated (or existed all
	// along), then just advance to the next break position in the cache
	// and return it
	if (cachedBreakPositions != NULL) {
	++positionInCache;
	fText->setIndex(cachedBreakPositions[positionInCache]);
	return cachedBreakPositions[positionInCache];
	}
	return -9999; // SHOULD NEVER GET HERE!
	}

	void
	DictionaryBasedBreakIterator::reset()
	{
	uprv_free(cachedBreakPositions);
	cachedBreakPositions = NULL;
	numCachedBreakPositions = 0;
	fDictionaryCharCount = 0;
	positionInCache = 0;
	}



	//-------------------------------------------------------------------------------
	//
	// init() Common initialization routine, for use by constructors, etc.
	//
	//-------------------------------------------------------------------------------
	void DictionaryBasedBreakIterator::init() {
	cachedBreakPositions = NULL;
	fTables = NULL;
	numCachedBreakPositions = 0;
	fDictionaryCharCount = 0;
	positionInCache = 0;
	}


	//-------------------------------------------------------------------------------
	//
	// BufferClone
	//
	//-------------------------------------------------------------------------------
	BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
	int32_t &bufferSize,
	UErrorCode &status)
	{
	if (U_FAILURE(status)){
	return NULL;
	}

	//
	// If user buffer size is zero this is a preflight operation to
	// obtain the needed buffer size, allowing for worst case misalignment.
	//
	if (bufferSize == 0) {
	bufferSize = sizeof(DictionaryBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
	return NULL;
	}

	//
	// Check the alignment and size of the user supplied buffer.
	// Allocate heap memory if the user supplied memory is insufficient.
	//
	char buf = (char )stackBuffer;
	uint32_t s = bufferSize;

	if (stackBuffer == NULL) {
	s = 0; // Ignore size, force allocation if user didn't give us a buffer.
	}
	if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
	int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(buf);
	s -= offsetUp;
	buf += offsetUp;
	}
	if (s < sizeof(DictionaryBasedBreakIterator)) {
	buf = (char *) new DictionaryBasedBreakIterator();
	if (buf == 0) {
	status = U_MEMORY_ALLOCATION_ERROR;
	return NULL;
	}
	status = U_SAFECLONE_ALLOCATED_WARNING;
	}

	//
	// Initialize the clone object.
	// TODO: using an overloaded C++ "operator new" to directly initialize the
	// copy in the user's buffer would be better, but it doesn't seem
	// to get along with namespaces. Investigate why.
	//
	// The memcpy is only safe with an empty (default constructed)
	// break iterator. Use on others can screw up reference counts
	// to data. memcpy-ing objects is not really a good idea...
	//
	DictionaryBasedBreakIterator localIter; // Empty break iterator, source for memcpy
	DictionaryBasedBreakIterator clone = (DictionaryBasedBreakIterator )buf;
	uprv_memcpy(clone, &localIter, sizeof(DictionaryBasedBreakIterator)); // clone = empty, but initialized, iterator.
	clone = this; // clone = the real one we want.
	if (status != U_SAFECLONE_ALLOCATED_WARNING) {
	clone->fBufferClone = TRUE;
	}
	return clone;
	}




	/**
	* This is the function that actually implements the dictionary-based
	* algorithm. Given the endpoints of a range of text, it uses the
	* dictionary to determine the positions of any boundaries in this
	* range. It stores all the boundary positions it discovers in
	* cachedBreakPositions so that we only have to do this work once
	* for each time we enter the range.
	*/
	void
	DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status)
	{
	// the range we're dividing may begin or end with non-dictionary characters
	// (i.e., for line breaking, we may have leading or trailing punctuation
	// that needs to be kept with the word). Seek from the beginning of the
	// range to the first dictionary character
	fText->setIndex(startPos);
	UChar c = fText->current();
	while (isDictionaryChar(c) == FALSE) {
	c = fText->next();
	}


	// initialize. We maintain two stacks: currentBreakPositions contains
	// the list of break positions that will be returned if we successfully
	// finish traversing the whole range now. possibleBreakPositions lists
	// all other possible word ends we've passed along the way. (Whenever
	// we reach an error [a sequence of characters that can't begin any word
	// in the dictionary], we back up, possibly delete some breaks from
	// currentBreakPositions, move a break from possibleBreakPositions
	// to currentBreakPositions, and start over from there. This process
	// continues in this way until we either successfully make it all the way
	// across the range, or exhaust all of our combinations of break
	// positions.) wrongBreakPositions is used to keep track of paths we've
	// tried on previous iterations. As the iterator backs up further and
	// further, this saves us from having to follow each possible path
	// through the text all the way to the error (hopefully avoiding many
	// future recursive calls as well).
	UStack currentBreakPositions(status);
	UStack possibleBreakPositions(status);
	UVector wrongBreakPositions(status);

	// the dictionary is implemented as a trie, which is treated as a state
	// machine. -1 represents the end of a legal word. Every word in the
	// dictionary is represented by a path from the root node to -1. A path
	// that ends in state 0 is an illegal combination of characters.
	int16_t state = 0;

	// these two variables are used for error handling. We keep track of the
	// farthest we've gotten through the range being divided, and the combination
	// of breaks that got us that far. If we use up all possible break
	// combinations, the text contains an error or a word that's not in the
	// dictionary. In this case, we "bless" the break positions that got us the
	// farthest as real break positions, and then start over from scratch with
	// the character where the error occurred.
	int32_t farthestEndPoint = fText->getIndex();
	UStack bestBreakPositions(status);
	UBool bestBreakPositionsInitialized = FALSE;

	if (U_FAILURE(status)) {
	return;
	}
	// initialize (we always exit the loop with a break statement)
	c = fText->current();
	for (;;) {

	// if we can transition to state "-1" from our current state, we're
	// on the last character of a legal word. Push that position onto
	// the possible-break-positions stack
	if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
	possibleBreakPositions.push(fText->getIndex(), status);
	}

	// look up the new state to transition to in the dictionary
	state = fTables->fDictionary->at(state, c);

	// if the character we're sitting on causes us to transition to
	// the "end of word" state, then it was a non-dictionary character
	// and we've successfully traversed the whole range. Drop out
	// of the loop.
	if (state == -1) {
	currentBreakPositions.push(fText->getIndex(), status);
	break;
	}

	// if the character we're sitting on causes us to transition to
	// the error state, or if we've gone off the end of the range
	// without transitioning to the "end of word" state, we've hit
	// an error...
	else if (state == 0 \|\| fText->getIndex() >= endPos) {

	// if this is the farthest we've gotten, take note of it in
	// case there's an error in the text
	if (fText->getIndex() > farthestEndPoint) {
	farthestEndPoint = fText->getIndex();
	bestBreakPositions.removeAllElements();
	bestBreakPositionsInitialized = TRUE;
	for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
	bestBreakPositions.push(currentBreakPositions.elementAti(i), status);
	}
	}

	// wrongBreakPositions is a list of all break positions we've tried starting
	// that didn't allow us to traverse all the way through the text. Every time
	// we pop a break position off of currentBreakPositions, we put it into
	// wrongBreakPositions to avoid trying it again later. If we make it to this
	// spot, we're either going to back up to a break in possibleBreakPositions
	// and try starting over from there, or we've exhausted all possible break
	// positions and are going to do the fallback procedure. This loop prevents
	// us from messing with anything in possibleBreakPositions that didn't work as
	// a starting point the last time we tried it (this is to prevent a bunch of
	// repetitive checks from slowing down some extreme cases)
	while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
	possibleBreakPositions.peeki())) {
	possibleBreakPositions.popi();
	}

	// if we've used up all possible break-position combinations, there's
	// an error or an unknown word in the text. In this case, we start
	// over, treating the farthest character we've reached as the beginning
	// of the range, and "blessing" the break positions that got us that
	// far as real break positions
	if (possibleBreakPositions.isEmpty()) {
	if (bestBreakPositionsInitialized) {
	currentBreakPositions.removeAllElements();
	for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
	currentBreakPositions.push(bestBreakPositions.elementAti(i), status);
	}
	bestBreakPositions.removeAllElements();
	if (farthestEndPoint < endPos) {
	fText->setIndex(farthestEndPoint + 1);
	}
	else {
	break;
	}
	}
	else {
	if ((currentBreakPositions.isEmpty()
	\|\| currentBreakPositions.peeki() != fText->getIndex())
	&& fText->getIndex() != startPos) {
	currentBreakPositions.push(fText->getIndex(), status);
	}
	fText->next();
	currentBreakPositions.push(fText->getIndex(), status);
	}
	}

	// if we still have more break positions we can try, then promote the
	// last break in possibleBreakPositions into currentBreakPositions,
	// and get rid of all entries in currentBreakPositions that come after
	// it. Then back up to that position and start over from there (i.e.,
	// treat that position as the beginning of a new word)
	else {
	int32_t temp = possibleBreakPositions.popi();
	int32_t temp2 = 0;
	while (!currentBreakPositions.isEmpty() && temp <
	currentBreakPositions.peeki()) {
	temp2 = currentBreakPositions.popi();
	wrongBreakPositions.addElement(temp2, status);
	}
	currentBreakPositions.push(temp, status);
	fText->setIndex(currentBreakPositions.peeki());
	}

	// re-sync "c" for the next go-round, and drop out of the loop if
	// we've made it off the end of the range
	c = fText->current();
	if (fText->getIndex() >= endPos) {
	break;
	}
	}

	// if we didn't hit any exceptional conditions on this last iteration,
	// just advance to the next character and loop
	else {
	c = fText->next();
	}
	}

	// dump the last break position in the list, and replace it with the actual
	// end of the range (which may be the same character, or may be further on
	// because the range actually ended with non-dictionary characters we want to
	// keep with the word)
	if (!currentBreakPositions.isEmpty()) {
	currentBreakPositions.popi();
	}
	currentBreakPositions.push(endPos, status);

	// create a regular array to hold the break positions and copy
	// the break positions from the stack to the array (in addition,
	// our starting position goes into this array as a break position).
	// This array becomes the cache of break positions used by next()
	// and previous(), so this is where we actually refresh the cache.
	if (cachedBreakPositions != NULL) {
	uprv_free(cachedBreakPositions);
	}
	cachedBreakPositions = (int32_t )uprv_malloc((currentBreakPositions.size() + 1) sizeof(int32_t));
	/* Test for NULL */
	if(cachedBreakPositions == NULL) {
	status = U_MEMORY_ALLOCATION_ERROR;
	return;
	}
	numCachedBreakPositions = currentBreakPositions.size() + 1;
	cachedBreakPositions[0] = startPos;

	for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
	cachedBreakPositions[i + 1] = currentBreakPositions.elementAti(i);
	}
	positionInCache = 0;
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

	/* eof */