| /* |
| ********************************************************************** |
| * Copyright (C) 1999-2000 IBM and others. All rights reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 03/22/2000 helena Creation. |
| ********************************************************************** |
| */ |
| #ifndef STRSRCH_H |
| #define STRSRCH_H |
| |
| #include "unicode/utypes.h" |
| #include "unicode/unistr.h" |
| #include "unicode/chariter.h" |
| #include "unicode/tblcoll.h" |
| #include "unicode/brkiter.h" |
| #include "srchiter.h" |
| |
| class SearchIterator; |
| /** |
| * <code>StringSearch</code> is a <code>SearchIterator</code> that provides |
| * language-sensitive text searching based on the comparison rules defined |
| * in a {@link RuleBasedCollator} object. |
| * Instances of <code>StringSearch</code> function as iterators |
| * maintain a current position and scan over text returning the index of |
| * characters where the pattern occurs and the length of each match. |
| * <p> |
| * <code>StringSearch</code> uses a version of the fast Boyer-Moore search |
| * algorithm that has been adapted to work with the large character set of |
| * Unicode. See "Efficient Text Searching in Java", to be published in |
| * <i>Java Report</i> in February, 1999, for further information on the algorithm. |
| * <p> |
| * Consult the <code>SearchIterator</code> documentation for information on |
| * and examples of how to use instances of this class to implement text |
| * searching. <code>SearchIterator</code> provides all of the necessary |
| * API; this class only provides constructors and internal implementation |
| * methods. |
| * |
| * @see SearchIterator |
| * @see RuleBasedCollator |
| * |
| * @author Laura Werner |
| * @version 1.0 |
| */ |
| |
| class StringSearch : public SearchIterator |
| { |
| public: |
| /** |
| * Construct a <code>StringSearch</code> object using a specific collator and set |
| * of boundary-detection rules. |
| * <p> |
| * @param pat The text for which this object will search. |
| * |
| * @param target The text in which to search for the pattern. |
| * |
| * @param coll A <code>RuleBasedCollator</code> object which defines the |
| * language-sensitive comparison rules used to determine |
| * whether text in the pattern and target matches. |
| * |
| * @param breaker A <code>BreakIterator</code> object used to constrain the matches |
| * that are found. Matches whose start and end indices |
| * in the target text are not boundaries as determined |
| * by the <code>BreakIterator</code> are ignored. If this behavior |
| * is not desired, <code>null</code> can be passed in instead. |
| */ |
| StringSearch(const UnicodeString& pat, |
| CharacterIterator* target, |
| RuleBasedCollator* coll, |
| BreakIterator* breaker, |
| UErrorCode& status); |
| |
| /** |
| * Construct a <code>StringSearch</code> object using a specific collator. |
| * <p> |
| * @param pattern The text for which this object will search. |
| * |
| * @param target The text in which to search for the pattern. |
| * |
| * @param collator A <code>RuleBasedCollator</code> object which defines the |
| * language-sensitive comparison rules used to determine |
| * whether text in the pattern and target matches. |
| */ |
| StringSearch(const UnicodeString& pattern, |
| CharacterIterator* target, |
| RuleBasedCollator* collator, |
| UErrorCode& status); |
| |
| /** |
| * copy constructor |
| */ |
| StringSearch(const StringSearch& that); |
| |
| /** |
| * Construct a <code>StringSearch</code> object using the collator and |
| * character boundary detection rules for a given locale |
| * <p> |
| * @param pattern The text for which this object will search. |
| * |
| * @param target The text in which to search for the pattern. |
| * |
| * @param loc The locale whose collation and break-detection rules |
| * should be used. |
| * |
| * @exception ClassCastException thrown if the collator for the specified |
| * locale is not a RuleBasedCollator. |
| */ |
| StringSearch(const UnicodeString& pattern, |
| CharacterIterator* target, |
| const Locale& loc, |
| UErrorCode& status); |
| /** |
| * Construct a <code>StringSearch</code> object using the collator for the default |
| * locale |
| * <p> |
| * @param pattern The text for which this object will search. |
| * |
| * @param target The text in which to search for the pattern. |
| * |
| * @param collator A <code>RuleBasedCollator</code> object which defines the |
| * language-sensitive comparison rules used to determine |
| * whether text in the pattern and target matches. |
| */ |
| StringSearch(const UnicodeString& pattern, |
| const UnicodeString& target, |
| UErrorCode& status); |
| |
| virtual ~StringSearch(void); |
| /** |
| * Assignment operator. Sets this iterator to have the same behavior, |
| * and iterate over the same text, as the one passed in. |
| */ |
| StringSearch& operator=(const StringSearch& that); |
| |
| /** |
| * Equality operator. Returns TRUE if both BreakIterators are of the |
| * same class, have the same behavior, and iterate over the same text. |
| */ |
| virtual UBool operator==(const SearchIterator& that) const; |
| |
| /** |
| * Not-equal operator. If operator== returns TRUE, this returns FALSE, |
| * and vice versa. |
| */ |
| UBool operator!=(const SearchIterator& that) const; |
| |
| /** |
| * Returns a newly-constructed RuleBasedBreakIterator with the same |
| * behavior, and iterating over the same text, as this one. |
| */ |
| virtual SearchIterator* clone(void) const; |
| |
| //------------------------------------------------------------------- |
| // Getters and Setters |
| //------------------------------------------------------------------- |
| |
| /** |
| * Sets this object's strength property. The strength determines the |
| * minimum level of difference considered significant during a |
| * search. Generally, {@link Collator#TERTIARY} and |
| * {@link Collator#IDENTICAL} indicate that all differences are |
| * considered significant, {@link Collator#SECONDARY} indicates |
| * that upper/lower case distinctions should be ignored, and |
| * {@link Collator#PRIMARY} indicates that both case and accents |
| * should be ignored. However, the exact meanings of these constants |
| * are determined by individual Collator objects. |
| * <p> |
| * @see Collator#PRIMARY |
| * @see Collator#SECONDARY |
| * @see Collator#TERTIARY |
| * @see Collator#IDENTICAL |
| */ |
| void setStrength(Collator::ECollationStrength newStrength, UErrorCode& status); |
| |
| |
| /** |
| * Returns this object's strength property, which indicates what level |
| * of differences are considered significant during a search. |
| * <p> |
| * @see #setStrength |
| */ |
| Collator::ECollationStrength getStrength(void) const; |
| |
| /** |
| * Set the collator to be used for this string search. Also changes |
| * the search strength to match that of the new collator. |
| * <p> |
| * This method causes internal data such as Boyer-Moore shift tables |
| * to be recalculated, but the iterator's position is unchanged. |
| * <p> |
| * @see #getCollator |
| */ |
| void setCollator(const RuleBasedCollator* coll, UErrorCode& status); |
| |
| /** |
| * Return the RuleBasedCollator being used for this string search. |
| */ |
| const RuleBasedCollator& getCollator() const; |
| |
| /** |
| * Set the pattern for which to search. |
| * This method causes internal data such as Boyer-Moore shift tables |
| * to be recalculated, but the iterator's position is unchanged. |
| */ |
| void setPattern(const UnicodeString& pat, UErrorCode& status); |
| |
| /** |
| * Returns the pattern for which this object is searching. |
| */ |
| const UnicodeString& getPattern() const; |
| |
| /** |
| * Set the target text which should be searched and resets the |
| * iterator's position to point before the start of the new text. |
| * This method is useful if you want to re-use an iterator to |
| * search for the same pattern within a different body of text. |
| */ |
| virtual void setTarget(const UnicodeString& newText); |
| |
| /** |
| * Set the target text which should be searched and resets the |
| * iterator's position to point before the start of the target text. |
| * This method is useful if you want to re-use an iterator to |
| * search for the same pattern within a different body of text. |
| * |
| * @see #getTarget |
| */ |
| virtual void adoptTarget(CharacterIterator* iterator); |
| |
| /** Reset iterator |
| */ |
| virtual void reset(void); |
| /** |
| * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. |
| * This method is to implement a simple version of RTTI, since not all |
| * C++ compilers support genuine RTTI. Polymorphic operator==() and |
| * clone() methods call this method. |
| * |
| * @return The class ID for this object. All objects of a |
| * given class have the same class ID. Objects of |
| * other classes have different class IDs. |
| */ |
| inline virtual UClassID getDynamicClassID(void) const; |
| |
| /** |
| * Returns the class ID for this class. This is useful only for |
| * comparing to a return value from getDynamicClassID(). For example: |
| * |
| * Base* polymorphic_pointer = createPolymorphicObject(); |
| * if (polymorphic_pointer->getDynamicClassID() == |
| * Derived::getStaticClassID()) ... |
| * |
| * @return The class ID for all objects of this class. |
| */ |
| inline static UClassID getStaticClassID(void); |
| |
| protected: |
| //------------------------------------------------------------------- |
| // Privates |
| //------------------------------------------------------------------- |
| |
| /** |
| * Search forward for matching text, starting at a given location. |
| * Clients should not call this method directly; instead they should call |
| * {@link SearchIterator#next}. |
| * <p> |
| * If a match is found, this method returns the index at which the match |
| * starts and calls {@link SearchIterator#setMatchLength} |
| * with the number of characters in the target |
| * text that make up the match. If no match is found, the method returns |
| * <code>DONE</code> and does not call <tt>setMatchLength</tt>. |
| * <p> |
| * @param start The index in the target text at which the search starts. |
| * |
| * @return The index at which the matched text in the target starts, or DONE |
| * if no match was found. |
| * <p> |
| * @see SearchIterator#next |
| * @see SearchIterator#DONE |
| */ |
| virtual int32_t handleNext(int32_t start, UErrorCode& status); |
| /** |
| * Search backward for matching text ,starting at a given location. |
| * Clients should not call this method directly; instead they should call |
| * <code>SearchIterator.previous()</code>, which this method overrides. |
| * <p> |
| * If a match is found, this method returns the index at which the match |
| * starts and calls {@link SearchIterator#setMatchLength} |
| * with the number of characters in the target |
| * text that make up the match. If no match is found, the method returns |
| * <code>DONE</code> and does not call <tt>setMatchLength</tt>. |
| * <p> |
| * @param start The index in the target text at which the search starts. |
| * |
| * @return The index at which the matched text in the target starts, or DONE |
| * if no match was found. |
| * <p> |
| * @see SearchIterator#previous |
| * @see SearchIterator#DONE |
| */ |
| virtual int32_t handlePrev(int32_t start, UErrorCode& status); |
| private: |
| /** |
| * Return a bitmask that will select only the portions of a collation |
| * element that are significant at the given strength level. |
| */ |
| static int32_t getMask(Collator::ECollationStrength strength); |
| |
| |
| void initialize(UErrorCode& status); |
| /** |
| * Method used by StringSearch to determine how far to the right to |
| * shift the pattern during a Boyer-Moore search. |
| * |
| * @param curValue The current value in the target text |
| * @param curIndex The index in the pattern at which we failed to match |
| * curValue in the target text. |
| */ |
| int32_t getShift( int32_t curValue, int32_t curIndex ) const; |
| |
| /** |
| * Method used by StringSearch to determine how far to the left to |
| * shift the pattern during a reverse Boyer-Moore search. |
| * |
| * @param curValue The current value in the target text |
| * @param curIndex The index in the pattern at which we failed to match |
| * curValue in the target text. |
| */ |
| int32_t getBackShift( int32_t curValue, int32_t curIndex ) const; |
| |
| /** |
| * Hash a collation element from its full size (32 bits) down into a |
| * value that can be used as an index into the shift tables. Right |
| * now we do a modulus by the size of the hash table. |
| * |
| * TODO: At some point I should experiment to see whether a slightly |
| * more complicated hash function gives us a better distribution |
| * on multilingual text. I doubt it will have much effect on |
| * performance, though. |
| */ |
| static int32_t hash(int32_t order); |
| |
| //------------------------------------------------------------------------ |
| // Private Data |
| // |
| CollationElementIterator *iter; |
| RuleBasedCollator *collator; |
| /* HSYS ? Why? Changes to this will not affect collator. no changes to the comparsion result */ |
| Collator::ECollationStrength strength; |
| |
| //------------------------------------------------------------------------ |
| // Everything from here on down is the data used to represent the |
| // Boyer-Moore shift tables and the code that generates and manipulates |
| // them. |
| // |
| int32_t *valueList; |
| int32_t valueListLen; |
| int32_t shiftTable[256]; |
| int32_t backShiftTable[256]; |
| |
| UnicodeString pattern; // The pattern string |
| int32_t normLen; // num. of collation elements in pattern. |
| int32_t minLen; // Min of composed, decomposed versions |
| int32_t maxLen; // Max |
| CollationElementIterator *it; // to be removed |
| |
| private: |
| /* to be removed */ |
| void dumpTables(); |
| /** |
| * Class ID |
| */ |
| static char fgClassID; |
| }; |
| |
| inline UBool StringSearch::operator!=(const SearchIterator& that) const |
| { |
| return !operator==(that); |
| } |
| |
| inline UClassID StringSearch::getDynamicClassID(void) const |
| { |
| return StringSearch::getStaticClassID(); |
| } |
| |
| inline UClassID StringSearch::getStaticClassID(void) |
| { |
| return (UClassID)(&fgClassID); |
| } |
| |
| |
| #endif |
| |