blob: 2f0edd6c4f26e4564dc42326b6e92a36a2353723 [file] [log] [blame]
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "hash.h"
#include "unicode/uniset.h"
#include "unicode/utext.h"
#include "uvectr32.h"
/**
 * A machine learning break engine for the phrase breaking in Japanese.
 */
class MlBreakEngine : public UMemory {
* Constructor.
* @param digitOrOpenPunctuationOrAlphabetSet An UnicodeSet with the digit, open punctuation and
* alphabet.
* @param closePunctuationSet An UnicodeSet with close punctuation.
* @param status Information on any errors encountered.
MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
const UnicodeSet &closePunctuationSet, UErrorCode &status);
* Virtual destructor.
virtual ~MlBreakEngine();
* Divide up a range of characters handled by this break engine.
* @param inText A UText representing the text
* @param rangeStart The start of the range of the characters
* @param rangeEnd The end of the range of the characters
* @param foundBreaks Output of C array of int32_t break positions, or 0
* @param inString The normalized string of text ranging from rangeStart to rangeEnd
* @param inputMap The vector storing the native index of inText
* @param status Information on any errors encountered.
* @return The number of breaks found
int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
UVector32 &foundBreaks, const UnicodeString &inString,
const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
* Load the machine learning's model file.
* @param error Information on any errors encountered.
void loadMLModel(UErrorCode &error);
* Initialize the element list from the input string.
* @param inString A input string to be segmented.
* @param elementList A list to store the first six characters.
* @param status Information on any errors encountered.
* @return The number of code units of the first six characters in inString.
int32_t initElementList(const UnicodeString &inString, UChar32* elementList,
UErrorCode &status) const;
* Evaluate whether the index is a potential breakpoint.
* @param elementList A list including six elements for the breakpoint evaluation.
* @param index The breakpoint index to be evaluated.
* @param numBreaks The accumulated number of breakpoints.
* @param boundary A vector including the index of the breakpoint.
* @param status Information on any errors encountered.
void evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
UVector32 &boundary, UErrorCode &status) const;
// Name mirrors the constructor's first parameter; presumably holds that set
// (digit, open punctuation and alphabet) -- confirm in the implementation.
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
// Name mirrors the constructor's second parameter; presumably holds the
// close-punctuation set -- confirm in the implementation.
UnicodeSet fClosePunctuationSet;
// Hashtable for the model; presumably filled by loadMLModel() -- TODO confirm.
Hashtable fModel;
// NOTE(review): meaning is not derivable from this header; the name suggests a
// sum over negative model scores -- confirm against the implementation.
int32_t fNegativeSum;