| /** | 
 |  ******************************************************************************* | 
 |  * Copyright (C) 2006,2012, International Business Machines Corporation        * | 
 |  * and others. All Rights Reserved.                                            * | 
 |  ******************************************************************************* | 
 |  */ | 
 |  | 
 | #ifndef DICTBE_H | 
 | #define DICTBE_H | 
 |  | 
 | #include "unicode/utypes.h" | 
 | #include "unicode/uniset.h" | 
 | #include "unicode/utext.h" | 
 |  | 
 | #include "brkeng.h" | 
 |  | 
 | U_NAMESPACE_BEGIN | 
 |  | 
 | class DictionaryMatcher; | 
 |  | 
 | /******************************************************************* | 
 |  * DictionaryBreakEngine | 
 |  */ | 
 |  | 
 | /** | 
 |  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a | 
 |  * dictionary to determine language-specific breaks.</p> | 
 |  * | 
 |  * <p>After it is constructed a DictionaryBreakEngine may be shared between | 
 |  * threads without synchronization.</p> | 
 |  */ | 
 | class DictionaryBreakEngine : public LanguageBreakEngine { | 
 |  private: | 
 |     /** | 
 |      * The set of characters handled by this engine (see setCharacters()). | 
 |      * @internal | 
 |      */ | 
 |  | 
 |   UnicodeSet    fSet; | 
 |  | 
 |     /** | 
 |      * Bitmap of the break types handled by this engine (see the | 
 |      * breakTypes argument of the public constructor). | 
 |      * @internal | 
 |      */ | 
 |  | 
 |   uint32_t      fTypes; | 
 |  | 
 |   /** | 
 |    * <p>Default constructor.</p> | 
 |    * | 
 |    * <p>Declared in the private section, so it cannot be called by | 
 |    * subclasses or clients; the only accessible constructor is the one | 
 |    * taking the break types.</p> | 
 |    */ | 
 |   DictionaryBreakEngine(); | 
 |  | 
 |  public: | 
 |  | 
 |   /** | 
 |    * <p>Constructor setting the break types handled.</p> | 
 |    * | 
 |    * @param breakTypes A bitmap of types handled by the engine. | 
 |    */ | 
 |   DictionaryBreakEngine( uint32_t breakTypes ); | 
 |  | 
 |   /** | 
 |    * <p>Virtual destructor.</p> | 
 |    */ | 
 |   virtual ~DictionaryBreakEngine(); | 
 |  | 
 |   /** | 
 |    * <p>Indicate whether this engine handles a particular character for | 
 |    * a particular kind of break.</p> | 
 |    * | 
 |    * @param c A character which begins a run that the engine might handle | 
 |    * @param breakType The type of text break which the caller wants to determine | 
 |    * @return TRUE if this engine handles the particular character and break | 
 |    * type. | 
 |    */ | 
 |   virtual UBool handles( UChar32 c, int32_t breakType ) const; | 
 |  | 
 |   /** | 
 |    * <p>Find any breaks within a run in the supplied text.</p> | 
 |    * | 
 |    * @param text A UText representing the text. The iterator is left at | 
 |    * the end of the run of characters which the engine is capable of handling | 
 |    * that starts from the first (or last) character in the range. | 
 |    * @param startPos The start of the run within the supplied text. | 
 |    * @param endPos The end of the run within the supplied text. | 
 |    * @param reverse Whether the caller is looking for breaks in a reverse | 
 |    * direction. | 
 |    * @param breakType The type of break desired, or -1. | 
 |    * @param foundBreaks A UStack, passed by reference, for the breaks found, | 
 |    * if any. (It is a UStack, not a C array.) | 
 |    * @return The number of breaks found. | 
 |    */ | 
 |   virtual int32_t findBreaks( UText *text, | 
 |                               int32_t startPos, | 
 |                               int32_t endPos, | 
 |                               UBool reverse, | 
 |                               int32_t breakType, | 
 |                               UStack &foundBreaks ) const; | 
 |  | 
 |  protected: | 
 |  | 
 |  /** | 
 |   * <p>Set the character set handled by this engine.</p> | 
 |   * | 
 |   * @param set A UnicodeSet of the set of characters handled by the engine | 
 |   */ | 
 |   virtual void setCharacters( const UnicodeSet &set ); | 
 |  | 
 |  /** | 
 |   * <p>Set the break types handled by this engine.</p> | 
 |   * | 
 |   * <p>NOTE: the declaration that follows is commented out, so this setter | 
 |   * is not part of the class; the break types are supplied through the | 
 |   * constructor instead.</p> | 
 |   * | 
 |   * @param breakTypes A bitmap of types handled by the engine. | 
 |   */ | 
 | //  virtual void setBreakTypes( uint32_t breakTypes ); | 
 |  | 
 |  /** | 
 |   * <p>Divide up a range of known dictionary characters handled by this break engine.</p> | 
 |   * | 
 |   * <p>Pure virtual: each concrete subclass must supply its own | 
 |   * implementation.</p> | 
 |   * | 
 |   * @param text A UText representing the text | 
 |   * @param rangeStart The start of the range of dictionary characters | 
 |   * @param rangeEnd The end of the range of dictionary characters | 
 |   * @param foundBreaks Output: a UStack, passed by reference, for the break | 
 |   * positions found | 
 |   * @return The number of breaks found | 
 |   */ | 
 |   virtual int32_t divideUpDictionaryRange( UText *text, | 
 |                                            int32_t rangeStart, | 
 |                                            int32_t rangeEnd, | 
 |                                            UStack &foundBreaks ) const = 0; | 
 |  | 
 | }; | 
 |  | 
 | /******************************************************************* | 
 |  * ThaiBreakEngine | 
 |  */ | 
 |  | 
 | /** | 
 |  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a | 
 |  * dictionary and heuristics to determine Thai-specific breaks.</p> | 
 |  * | 
 |  * <p>After it is constructed a ThaiBreakEngine may be shared between | 
 |  * threads without synchronization.</p> | 
 |  */ | 
 | class ThaiBreakEngine : public DictionaryBreakEngine { | 
 |  private: | 
 |     /** | 
 |      * The sets of characters used by this engine, followed by fDictionary, | 
 |      * the DictionaryMatcher pointer (presumably the adopted dictionary | 
 |      * passed to the constructor -- confirm in the implementation). | 
 |      * @internal | 
 |      */ | 
 |  | 
 |   UnicodeSet                fThaiWordSet; | 
 |   UnicodeSet                fEndWordSet; | 
 |   UnicodeSet                fBeginWordSet; | 
 |   UnicodeSet                fSuffixSet; | 
 |   UnicodeSet                fMarkSet; | 
 |   DictionaryMatcher  *fDictionary; | 
 |  | 
 |  public: | 
 |  | 
 |   /** | 
 |    * <p>Constructor.</p> | 
 |    * | 
 |    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the | 
 |    * engine is deleted. | 
 |    * @param status A UErrorCode passed by reference; presumably reports any | 
 |    * failure during construction (not visible in this header -- confirm in | 
 |    * the implementation). | 
 |    */ | 
 |   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); | 
 |  | 
 |   /** | 
 |    * <p>Virtual destructor.</p> | 
 |    */ | 
 |   virtual ~ThaiBreakEngine(); | 
 |  | 
 |  protected: | 
 |  /** | 
 |   * <p>Divide up a range of known dictionary characters handled by this break engine.</p> | 
 |   * | 
 |   * <p>Declared with the same signature as the pure virtual function in | 
 |   * DictionaryBreakEngine.</p> | 
 |   * | 
 |   * @param text A UText representing the text | 
 |   * @param rangeStart The start of the range of dictionary characters | 
 |   * @param rangeEnd The end of the range of dictionary characters | 
 |   * @param foundBreaks Output: a UStack, passed by reference, for the break | 
 |   * positions found | 
 |   * @return The number of breaks found | 
 |   */ | 
 |   virtual int32_t divideUpDictionaryRange( UText *text, | 
 |                                            int32_t rangeStart, | 
 |                                            int32_t rangeEnd, | 
 |                                            UStack &foundBreaks ) const; | 
 |  | 
 | }; | 
 |  | 
 | #if !UCONFIG_NO_NORMALIZATION | 
 |  | 
 | /******************************************************************* | 
 |  * CjkBreakEngine | 
 |  */ | 
 |  | 
 | // Indicates the language/script that a CjkBreakEngine will handle; passed as the 'type' argument of its constructor. | 
 | enum LanguageType { | 
 |     kKorean, | 
 |     kChineseJapanese | 
 | }; | 
 |  | 
 | /** | 
 |  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a | 
 |  * dictionary with costs associated with each word and | 
 |  * Viterbi decoding to determine CJK-specific breaks.</p> | 
 |  */ | 
 | class CjkBreakEngine : public DictionaryBreakEngine { | 
 |  protected: | 
 |     /** | 
 |      * The sets of characters used by this engine, followed by fDictionary, | 
 |      * the DictionaryMatcher pointer (presumably the adopted dictionary | 
 |      * passed to the constructor -- confirm in the implementation). | 
 |      * @internal | 
 |      */ | 
 |   UnicodeSet                fHangulWordSet; | 
 |   UnicodeSet                fHanWordSet; | 
 |   UnicodeSet                fKatakanaWordSet; | 
 |   UnicodeSet                fHiraganaWordSet; | 
 |  | 
 |   DictionaryMatcher  *fDictionary; | 
 |  | 
 |  public: | 
 |  | 
 |     /** | 
 |      * <p>Constructor.</p> | 
 |      * | 
 |      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the | 
 |      * engine is deleted. The DictionaryMatcher must contain costs for each word | 
 |      * in order for the dictionary to work properly. | 
 |      * @param type The LanguageType (kKorean or kChineseJapanese) indicating | 
 |      * the language/script this engine will handle. | 
 |      * @param status A UErrorCode passed by reference; presumably reports any | 
 |      * failure during construction (not visible in this header -- confirm in | 
 |      * the implementation). | 
 |      */ | 
 |   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); | 
 |  | 
 |     /** | 
 |      * <p>Virtual destructor.</p> | 
 |      */ | 
 |   virtual ~CjkBreakEngine(); | 
 |  | 
 |  protected: | 
 |     /** | 
 |      * <p>Divide up a range of known dictionary characters handled by this break engine.</p> | 
 |      * | 
 |      * <p>Declared with the same signature as the pure virtual function in | 
 |      * DictionaryBreakEngine.</p> | 
 |      * | 
 |      * @param text A UText representing the text | 
 |      * @param rangeStart The start of the range of dictionary characters | 
 |      * @param rangeEnd The end of the range of dictionary characters | 
 |      * @param foundBreaks Output: a UStack, passed by reference, for the break | 
 |      * positions found | 
 |      * @return The number of breaks found | 
 |      */ | 
 |   virtual int32_t divideUpDictionaryRange( UText *text, | 
 |           int32_t rangeStart, | 
 |           int32_t rangeEnd, | 
 |           UStack &foundBreaks ) const; | 
 |  | 
 | }; | 
 |  | 
 | #endif | 
 |  | 
 | /*******************************************************************  | 
 |  * KhmerBreakEngine  | 
 |  */  | 
 |   | 
 | /**  | 
 |  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a  | 
 |  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>  | 
 |  *  | 
 |  * <p>After it is constructed a KhmerBreakEngine may be shared between  | 
 |  * threads without synchronization.</p>  | 
 |  */  | 
 | class KhmerBreakEngine : public DictionaryBreakEngine {  | 
 |  private:  | 
 |     /**  | 
 |      * The sets of characters used by this engine, followed by fDictionary,  | 
 |      * the DictionaryMatcher pointer (presumably the adopted dictionary  | 
 |      * passed to the constructor -- confirm in the implementation).  | 
 |      * @internal  | 
 |      */  | 
 |   | 
 |   UnicodeSet                fKhmerWordSet;  | 
 |   UnicodeSet                fEndWordSet;  | 
 |   UnicodeSet                fBeginWordSet;  | 
 |   UnicodeSet                fMarkSet;  | 
 |   DictionaryMatcher  *fDictionary;  | 
 |   | 
 |  public:  | 
 |   | 
 |   /**  | 
 |    * <p>Constructor.</p>  | 
 |    *  | 
 |    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the  | 
 |    * engine is deleted.  | 
 |    * @param status A UErrorCode passed by reference; presumably reports any  | 
 |    * failure during construction (not visible in this header -- confirm in  | 
 |    * the implementation).  | 
 |    */  | 
 |   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);  | 
 |   | 
 |   /**  | 
 |    * <p>Virtual destructor.</p>  | 
 |    */  | 
 |   virtual ~KhmerBreakEngine();  | 
 |   | 
 |  protected:  | 
 |  /**  | 
 |   * <p>Divide up a range of known dictionary characters.</p>  | 
 |   *  | 
 |   * <p>Declared with the same signature as the pure virtual function in  | 
 |   * DictionaryBreakEngine.</p>  | 
 |   *  | 
 |   * @param text A UText representing the text  | 
 |   * @param rangeStart The start of the range of dictionary characters  | 
 |   * @param rangeEnd The end of the range of dictionary characters  | 
 |   * @param foundBreaks Output: a UStack, passed by reference, for the break  | 
 |   * positions found  | 
 |   * @return The number of breaks found  | 
 |   */  | 
 |   virtual int32_t divideUpDictionaryRange( UText *text,  | 
 |                                            int32_t rangeStart,  | 
 |                                            int32_t rangeEnd,  | 
 |                                            UStack &foundBreaks ) const;  | 
 |   | 
 | };  | 
 |   | 
 |   | 
 | U_NAMESPACE_END | 
 |  | 
 |     /* DICTBE_H */ | 
 | #endif |