// blob: 07ae02de33ef6ee57b7ecb1b255f343175d855b3 [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#ifdef __cplusplus
#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/unorm.h"
#include "rbt.h"
#include "hash.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
// Forward declarations. Within this header these types are used only as
// pointers, references, or in a friend declaration, so their full
// definitions are not needed here.
class TransliterationRuleData;
class UnicodeFunctor;
class ParseData;
class RuleHalf;
class ParsePosition;
class StringMatcher;
/**
* Parser for transliterator rule sets. A caller constructs a parser, calls
* parse() with the rule text and a direction, and then reads the results
* from the public data members (dataVector, idBlockVector, compoundFilter).
*
* The class is declared non-copyable: the copy constructor and assignment
* operator are private and, per the note next to them, have no
* implementation.
*/
class TransliteratorParser : public UMemory {
public:
/**
* PUBLIC data member.
* A Vector of TransliterationRuleData objects, one for each discrete group
* of rules in the rule set.
*/
UVector dataVector;
/**
* PUBLIC data member.
* A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
*/
UVector idBlockVector;
/**
* PUBLIC data member containing the parsed compound filter, if any.
*/
UnicodeSet* compoundFilter;
private:
/**
* The current data object for which we are parsing rules.
*/
TransliterationRuleData* curData;
/**
* Direction associated with the rules being parsed.
* NOTE(review): presumably set from the 'direction' argument of parse();
* confirm against the implementation.
*/
UTransDirection direction;
/**
* Parse error information.
*/
UParseError parseError;
/**
* Temporary symbol table used during parsing.
*/
ParseData* parseData;
/**
* Temporary vector of matcher variables. When parsing is complete, this
* is copied into the array data.variables. As with data.variables,
* element 0 corresponds to character data.variablesBase.
*/
UVector variablesVector;
/**
* Temporary table of variable names. When parsing is complete, this is
* copied into data.variableNames.
*/
Hashtable variableNames;
/**
* String of stand-ins for segments. Used during the parsing of a single
* rule. segmentStandins.charAt(0) is the stand-in for "$1" and corresponds
* to StringMatcher object segmentObjects.elementAt(0), etc.
*/
UnicodeString segmentStandins;
/**
* Vector of StringMatcher objects for segments. Used during the
* parsing of a single rule. Indexed in parallel with segmentStandins:
* segmentStandins.charAt(0) is the stand-in for "$1" and corresponds
* to StringMatcher object segmentObjects.elementAt(0), etc.
*/
UVector segmentObjects;
/**
* The next available stand-in for variables. This starts at some point in
* the private use area (discovered dynamically) and increments up toward
* <code>variableLimit</code>. At any point during parsing, available
* variables are <code>variableNext..variableLimit-1</code>.
*/
UChar variableNext;
/**
* The limit (exclusive upper bound) of the stand-ins available for
* variables. This is discovered dynamically. At any point during parsing,
* available variables are <code>variableNext..variableLimit-1</code>.
*/
UChar variableLimit;
/**
* When we encounter an undefined variable, we do not immediately signal
* an error, in case we are defining this variable, e.g., "$a = [a-z];".
* Instead, we save the name of the undefined variable, and substitute
* in the placeholder char variableLimit - 1, and decrement
* variableLimit.
*/
UnicodeString undefinedVariableName;
/**
* The stand-in character for the 'dot' set, represented by '.' in
* patterns. This is allocated the first time it is needed, and
* reused thereafter.
*/
UChar dotStandIn;
public:
/**
* Constructor.
* @param statusReturn error code reference.
* NOTE(review): presumably set to a failure code if
* construction fails; confirm against the definition.
*/
TransliteratorParser(UErrorCode &statusReturn);
/**
* Destructor.
*/
~TransliteratorParser();
/**
* Parse the given string as a sequence of rules, in the given direction,
* and cause this object to implement those rules. Any previous rules are
* discarded. Typically this method is called exactly once after
* construction.
*
* After this call returns, query the public data members for results.
* The caller owns the 'data' and 'compoundFilter' data members after this
* call returns.
* NOTE(review): this class has no member named 'data'; the sentence above
* presumably refers to the contents of dataVector. Confirm before relying
* on it.
* NOTE(review): earlier revisions of this comment described the rules as
* separated by newline characters ('\n') in one place and by ';' in
* another; confirm the actual separator handling in the implementation.
* @param rules the rules to parse.
* @param direction either FORWARD or REVERSE.
* @param pe Struct to receive information on position
* of error if an error is encountered
* @param ec Output param set to success/failure code.
*/
void parse(const UnicodeString& rules,
UTransDirection direction,
UParseError& pe,
UErrorCode& ec);
/**
* Return the compound filter parsed by parse(). Caller owns result.
* @return the compound filter parsed by parse().
*/
UnicodeSet* orphanCompoundFilter();
private:
/**
* Parse the given rules in the given direction.
* NOTE(review): presumably the internal worker behind parse(); confirm
* against the implementation.
* @param rules the rules to parse (input only; passed by const reference).
* @param direction either FORWARD or REVERSE.
* @param status error code reference.
*/
void parseRules(const UnicodeString& rules,
UTransDirection direction,
UErrorCode& status);
/**
* MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not
* parse characters at or after limit.
*
* Important: The character at pos must be a non-whitespace character
* that is not the comment character.
*
* This method handles quoting, escaping, and whitespace removal. It
* parses the end-of-rule character. It recognizes context and cursor
* indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list.
* @param rule the rule string to parse (input only).
* @param pos the starting position.
* @param limit pointer past the last character of the rule.
* @param status error code reference.
* @return the index after the last character parsed.
*/
int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
/**
* Set the variable range to [start, end] (inclusive).
* @param start the start value of the range.
* @param end the end value of the range.
* @param status error code reference.
*/
void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
/**
* Check that the given character is NOT within the variable range.
* If it is, return false. This is necessary to ensure that the
* variable range does not overlap characters used in a rule.
* @param ch the given character.
* @return True, if the given character is NOT within the variable range.
*/
UBool checkVariableRange(UChar32 ch) const;
/**
* Set the maximum backup to 'backup', in response to a pragma
* statement.
* @param backup the new value to be set.
*/
void pragmaMaximumBackup(int32_t backup);
/**
* Begin normalizing all rules using the given mode, in response
* to a pragma statement.
* @param mode the given mode.
*/
void pragmaNormalizeRules(UNormalizationMode mode);
/**
* Return true if the given rule looks like a pragma.
* @param rule the rule string to examine.
* @param pos offset to the first non-whitespace character
* of the rule.
* @param limit pointer past the last character of the rule.
* @return true if the given rule looks like a pragma.
*/
static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
/**
* Parse a pragma. This method assumes resemblesPragma() has
* already returned true.
* @param rule the rule string containing the pragma.
* @param pos offset to the first non-whitespace character
* of the rule.
* @param limit pointer past the last character of the rule.
* @param status error code reference.
* @return the position index after the final ';' of the pragma,
* or -1 on failure.
*/
int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
/**
* Called by main parser upon syntax error. Search the rule string
* for the probable end of the rule. Of course, if the error is that
* the end of rule marker is missing, then the rule end will not be found.
* In any case the rule start will be correctly reported.
* @param parseErrorCode error code.
* @param (unnamed) second parameter, left unnamed in this declaration.
* Earlier documentation called it 'msg' (an error description).
* NOTE(review): confirm its actual meaning against the definition.
* @param start position of first character of current rule.
* @param status error code reference.
* @return start position of first character of current rule.
*/
int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
UErrorCode& status);
/**
* Parse a UnicodeSet out, store it, and return the stand-in character
* used to represent it.
*
* @param rule the rule for UnicodeSet.
* @param pos the position in pattern at which to start parsing.
* @param status error code reference.
* @return the stand-in character used to represent it.
*/
UChar parseSet(const UnicodeString& rule,
ParsePosition& pos,
UErrorCode& status);
/**
* Generate and return a stand-in for a new UnicodeFunctor. Store
* the matcher (adopt it).
* @param adopted the UnicodeFunctor to be adopted.
* @param status error code reference.
* @return a stand-in for a new UnicodeFunctor.
*/
UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
/**
* Return the stand-in for segment seg (1-based).
* @param seg the given segment.
* @param status error code reference.
* @return the stand-in character for the given segment.
*/
UChar getSegmentStandin(int32_t seg, UErrorCode& status);
/**
* Set the object for segment seg (1-based).
* @param seg the given segment.
* @param adopted the StringMatcher to be adopted.
* @param status error code reference.
*/
void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
/**
* Return the stand-in for the dot set. It is allocated the first
* time and reused thereafter.
* @param status error code reference.
* @return the stand-in for the dot set.
*/
UChar getDotStandIn(UErrorCode& status);
/**
* Append the value of the given variable name to the given
* UnicodeString.
* @param name the name of the variable whose value is appended.
* @param buf the given UnicodeString to append to.
* @param status error code reference.
*/
void appendVariableDef(const UnicodeString& name,
UnicodeString& buf,
UErrorCode& status);
/**
* Glue method to get around access restrictions in C++.
* The declaration below is commented out and therefore not part of the
* class.
*/
/*static Transliterator* createBasicInstance(const UnicodeString& id,
const UnicodeString* canonID);*/
// RuleHalf is granted access to the private members of this class.
friend class RuleHalf;
// Disallowed methods; no impl. They are declared private so that the
// parser cannot be copied or assigned by outside code.
/**
* Copy constructor (disallowed; not implemented).
*/
TransliteratorParser(const TransliteratorParser&);
/**
* Assignment operator (disallowed; not implemented).
*/
TransliteratorParser& operator=(const TransliteratorParser&);
};
U_NAMESPACE_END
#endif /* #ifdef __cplusplus */
/**
* Strip/convert the following from the transliterator rules:
* comments
* newlines
* white space at the beginning and end of a line
* unescape \u notation
*
* The target buffer must be equal in size to the source.
*
* This prototype sits outside the __cplusplus guard above and uses U_CAPI,
* so it is visible to C as well as C++ translation units.
*
* @param source the rule text to process.
* @param sourceLen length associated with source.
* NOTE(review): presumably a count of UChars; confirm.
* @param target buffer that receives the processed rules.
* @param status pointer to an error code.
* @return NOTE(review): presumably the length of the text written to
* target; confirm against the definition.
* @internal
*/
U_CAPI int32_t
utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
#endif