|  | // © 2016 and later: Unicode, Inc. and others. | 
|  | // License & terms of use: http://www.unicode.org/copyright.html | 
|  | /** | 
|  | ************************************************************************************ | 
|  | * Copyright (C) 2006-2012, International Business Machines Corporation and others. * | 
|  | * All Rights Reserved.                                                             * | 
|  | ************************************************************************************ | 
|  | */ | 
|  |  | 
|  | #ifndef BRKENG_H | 
|  | #define BRKENG_H | 
|  |  | 
|  | #include "unicode/utypes.h" | 
|  | #include "unicode/uobject.h" | 
|  | #include "unicode/utext.h" | 
|  | #include "unicode/uscript.h" | 
|  |  | 
|  | U_NAMESPACE_BEGIN | 
|  |  | 
|  | class UnicodeSet; | 
|  | class UStack; | 
|  | class UVector32; | 
|  | class DictionaryMatcher; | 
|  |  | 
|  | /******************************************************************* | 
|  | * LanguageBreakEngine | 
|  | */ | 
|  |  | 
|  | /** | 
|  | * <p>LanguageBreakEngines implement language-specific knowledge for | 
|  | * finding text boundaries within a run of characters belonging to a | 
|  | * specific set. The boundaries will be of a specific kind, e.g. word, | 
|  | * line, etc.</p> | 
|  | * | 
|  | * <p>LanguageBreakEngines should normally be implemented so as to | 
|  | * be shared between threads without locking.</p> | 
|  | */ | 
|  | class LanguageBreakEngine : public UMemory { | 
|  | public: | 
|  |  | 
|  | /** | 
|  | * <p>Default constructor.</p> | 
|  | * | 
|  | */ | 
|  | LanguageBreakEngine(); | 
|  |  | 
|  | /** | 
|  | * <p>Virtual destructor.</p> | 
|  | */ | 
|  | virtual ~LanguageBreakEngine(); | 
|  |  | 
|  | /** | 
|  | * <p>Indicate whether this engine handles a particular character for | 
|  | * a particular kind of break.</p> | 
|  | * | 
|  | * @param c A character which begins a run that the engine might handle | 
|  | * @param breakType The type of text break which the caller wants to determine | 
|  | * @return TRUE if this engine handles the particular character and break | 
|  | * type. | 
|  | */ | 
|  | virtual UBool handles(UChar32 c, int32_t breakType) const = 0; | 
|  |  | 
|  | /** | 
|  | * <p>Find any breaks within a run in the supplied text.</p> | 
|  | * | 
|  | * @param text A UText representing the text. The | 
|  | * iterator is left at the end of the run of characters which the engine | 
|  | * is capable of handling. | 
|  | * @param startPos The start of the run within the supplied text. | 
|  | * @param endPos The end of the run within the supplied text. | 
|  | * @param breakType The type of break desired, or -1. | 
|  | * @param foundBreaks A Vector of int32_t to receive the breaks. | 
|  | * @return The number of breaks found. | 
|  | */ | 
|  | virtual int32_t findBreaks( UText *text, | 
|  | int32_t startPos, | 
|  | int32_t endPos, | 
|  | int32_t breakType, | 
|  | UVector32 &foundBreaks ) const = 0; | 
|  |  | 
|  | }; | 
|  |  | 
|  | /******************************************************************* | 
|  | * LanguageBreakFactory | 
|  | */ | 
|  |  | 
|  | /** | 
|  | * <p>LanguageBreakFactorys find and return a LanguageBreakEngine | 
|  | * that can determine breaks for characters in a specific set, if | 
|  | * such an object can be found.</p> | 
|  | * | 
|  | * <p>If a LanguageBreakFactory is to be shared between threads, | 
|  | * appropriate synchronization must be used; there is none internal | 
|  | * to the factory.</p> | 
|  | * | 
|  | * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can | 
|  | * normally be shared between threads without synchronization, unless | 
|  | * the specific subclass of LanguageBreakFactory indicates otherwise.</p> | 
|  | * | 
|  | * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine | 
|  | * it returns when it itself is deleted, unless the specific subclass of | 
|  | * LanguageBreakFactory indicates otherwise. Naturally, the factory should | 
|  | * not be deleted until the LanguageBreakEngines it has returned are no | 
|  | * longer needed.</p> | 
|  | */ | 
|  | class LanguageBreakFactory : public UMemory { | 
|  | public: | 
|  |  | 
|  | /** | 
|  | * <p>Default constructor.</p> | 
|  | * | 
|  | */ | 
|  | LanguageBreakFactory(); | 
|  |  | 
|  | /** | 
|  | * <p>Virtual destructor.</p> | 
|  | */ | 
|  | virtual ~LanguageBreakFactory(); | 
|  |  | 
|  | /** | 
|  | * <p>Find and return a LanguageBreakEngine that can find the desired | 
|  | * kind of break for the set of characters to which the supplied | 
|  | * character belongs. It is up to the set of available engines to | 
|  | * determine what the sets of characters are.</p> | 
|  | * | 
|  | * @param c A character that begins a run for which a LanguageBreakEngine is | 
|  | * sought. | 
|  | * @param breakType The kind of text break for which a LanguageBreakEngine is | 
|  | * sought. | 
|  | * @return A LanguageBreakEngine with the desired characteristics, or 0. | 
|  | */ | 
|  | virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; | 
|  |  | 
|  | }; | 
|  |  | 
|  | /******************************************************************* | 
|  | * UnhandledEngine | 
|  | */ | 
|  |  | 
|  | /** | 
|  | * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that | 
|  | * handles characters that no other LanguageBreakEngine is available to | 
|  | * handle. It is told the character and the type of break; at its | 
|  | * discretion it may handle more than the specified character (e.g., | 
|  | * the entire script to which that character belongs).</p> | 
|  | * | 
|  | * <p>UnhandledEngines may not be shared between threads without | 
|  | * external synchronization.</p> | 
|  | */ | 
|  |  | 
|  | class UnhandledEngine : public LanguageBreakEngine { | 
|  | private: | 
|  |  | 
|  | /** | 
|  | * The sets of characters handled, for each break type | 
|  | * @internal | 
|  | */ | 
|  |  | 
|  | UnicodeSet    *fHandled[4]; | 
|  |  | 
|  | public: | 
|  |  | 
|  | /** | 
|  | * <p>Default constructor.</p> | 
|  | * | 
|  | */ | 
|  | UnhandledEngine(UErrorCode &status); | 
|  |  | 
|  | /** | 
|  | * <p>Virtual destructor.</p> | 
|  | */ | 
|  | virtual ~UnhandledEngine(); | 
|  |  | 
|  | /** | 
|  | * <p>Indicate whether this engine handles a particular character for | 
|  | * a particular kind of break.</p> | 
|  | * | 
|  | * @param c A character which begins a run that the engine might handle | 
|  | * @param breakType The type of text break which the caller wants to determine | 
|  | * @return TRUE if this engine handles the particular character and break | 
|  | * type. | 
|  | */ | 
|  | virtual UBool handles(UChar32 c, int32_t breakType) const; | 
|  |  | 
|  | /** | 
|  | * <p>Find any breaks within a run in the supplied text.</p> | 
|  | * | 
|  | * @param text A UText representing the text (TODO: UText). The | 
|  | * iterator is left at the end of the run of characters which the engine | 
|  | * is capable of handling. | 
|  | * @param startPos The start of the run within the supplied text. | 
|  | * @param endPos The end of the run within the supplied text. | 
|  | * @param breakType The type of break desired, or -1. | 
|  | * @param foundBreaks An allocated C array of the breaks found, if any | 
|  | * @return The number of breaks found. | 
|  | */ | 
|  | virtual int32_t findBreaks( UText *text, | 
|  | int32_t startPos, | 
|  | int32_t endPos, | 
|  | int32_t breakType, | 
|  | UVector32 &foundBreaks ) const; | 
|  |  | 
|  | /** | 
|  | * <p>Tell the engine to handle a particular character and break type.</p> | 
|  | * | 
|  | * @param c A character which the engine should handle | 
|  | * @param breakType The type of text break for which the engine should handle c | 
|  | */ | 
|  | virtual void handleCharacter(UChar32 c, int32_t breakType); | 
|  |  | 
|  | }; | 
|  |  | 
|  | /******************************************************************* | 
|  | * ICULanguageBreakFactory | 
|  | */ | 
|  |  | 
|  | /** | 
|  | * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for | 
|  | * ICU. It creates dictionary-based LanguageBreakEngines from dictionary | 
|  | * data in the ICU data file.</p> | 
|  | */ | 
|  | class ICULanguageBreakFactory : public LanguageBreakFactory { | 
|  | private: | 
|  |  | 
|  | /** | 
|  | * The stack of break engines created by this factory | 
|  | * @internal | 
|  | */ | 
|  |  | 
|  | UStack    *fEngines; | 
|  |  | 
|  | public: | 
|  |  | 
|  | /** | 
|  | * <p>Standard constructor.</p> | 
|  | * | 
|  | */ | 
|  | ICULanguageBreakFactory(UErrorCode &status); | 
|  |  | 
|  | /** | 
|  | * <p>Virtual destructor.</p> | 
|  | */ | 
|  | virtual ~ICULanguageBreakFactory(); | 
|  |  | 
|  | /** | 
|  | * <p>Find and return a LanguageBreakEngine that can find the desired | 
|  | * kind of break for the set of characters to which the supplied | 
|  | * character belongs. It is up to the set of available engines to | 
|  | * determine what the sets of characters are.</p> | 
|  | * | 
|  | * @param c A character that begins a run for which a LanguageBreakEngine is | 
|  | * sought. | 
|  | * @param breakType The kind of text break for which a LanguageBreakEngine is | 
|  | * sought. | 
|  | * @return A LanguageBreakEngine with the desired characteristics, or 0. | 
|  | */ | 
|  | virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); | 
|  |  | 
|  | protected: | 
|  | /** | 
|  | * <p>Create a LanguageBreakEngine for the set of characters to which | 
|  | * the supplied character belongs, for the specified break type.</p> | 
|  | * | 
|  | * @param c A character that begins a run for which a LanguageBreakEngine is | 
|  | * sought. | 
|  | * @param breakType The kind of text break for which a LanguageBreakEngine is | 
|  | * sought. | 
|  | * @return A LanguageBreakEngine with the desired characteristics, or 0. | 
|  | */ | 
|  | virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); | 
|  |  | 
|  | /** | 
|  | * <p>Create a DictionaryMatcher for the specified script and break type.</p> | 
|  | * @param script An ISO 15924 script code that identifies the dictionary to be | 
|  | * created. | 
|  | * @param breakType The kind of text break for which a dictionary is | 
|  | * sought. | 
|  | * @return A DictionaryMatcher with the desired characteristics, or NULL. | 
|  | */ | 
|  | virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); | 
|  | }; | 
|  |  | 
|  | U_NAMESPACE_END | 
|  |  | 
|  | /* BRKENG_H */ | 
|  | #endif |