source/i18n/rbt_pars.h - external/github.com/unicode-org/icu - Git at Google

 /*
 * Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   11/17/99    aliu        Creation.
 **********************************************************************
 */
 #ifndef RBT_PARS_H
 #define RBT_PARS_H

 #include "unicode/rbt.h"
 #include "unicode/parseerr.h"
 #include "unicode/unorm.h"

 U_NAMESPACE_BEGIN

 class TransliterationRuleData;
 class UnicodeMatcher;
 class ParseData;
 class RuleHalf;
 class ParsePosition;
 class UVector;

 class TransliteratorParser {

  public:

     /**
      * PUBLIC data member containing the parsed data object, or null if
      * there were no rules.
      */
     TransliterationRuleData* data;

     /**
      * PUBLIC data member.
      * The block of ::IDs, both at the top and at the bottom.
      * Inserted into these may be additional rules at the
      * idSplitPoint.
      */
     UnicodeString idBlock;

     /**
      * PUBLIC data member.
      * In a compound RBT, the index at which the RBT rules are
      * inserted into the ID block.  Index 0 means before any IDs
      * in the block.  Index idBlock.length() means after all IDs
      * in the block.  Index is a string index.
      */
     int32_t idSplitPoint;

     /**
      * PUBLIC data member containing the parsed compound filter, if any.
      */
     UnicodeSet* compoundFilter;

  private:

     // The number of rules parsed.  This tells us if there were
     // any actual transliterator rules, or if there were just ::ID
     // block IDs.
     int32_t ruleCount;

     UTransDirection direction;

     /**
      * We use a single error code during parsing.  Rather than pass it
      * through each API, we keep it here.
      */
     UErrorCode status;

     /**
      * Parse error information.
      */
     UParseError parseError;

     /**
      * Temporary symbol table used during parsing.
      */
     ParseData* parseData;

     /**
      * Temporary vector of matcher variables.  When parsing is complete, this
      * is copied into the array data.variables.  As with data.variables,
      * element 0 corresponds to character data.variablesBase.
      */
     UVector* variablesVector;

     /**
      * The next available stand-in for variables.  This starts at some point in
      * the private use area (discovered dynamically) and increments up toward
      * <code>variableLimit</code>.  At any point during parsing, available
      * variables are <code>variableNext..variableLimit-1</code>.
      */
     UChar variableNext;

     /**
      * The last available stand-in for variables.  This is discovered
      * dynamically.  At any point during parsing, available variables are
      * <code>variableNext..variableLimit-1</code>.
      */
     UChar variableLimit;

     /**
      * When we encounter an undefined variable, we do not immediately signal
      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
      * Instead, we save the name of the undefined variable, and substitute
      * in the placeholder char variableLimit - 1, and decrement
      * variableLimit.
      */
     UnicodeString undefinedVariableName;

     /**
      * The stand-in character for the 'dot' set, represented by '.' in
      * patterns.  This is allocated the first time it is needed, and
      * reused thereafter.
      */
     UChar dotStandIn;

 public:

     /**
      * Constructor.
      */
     TransliteratorParser();

     /**
      * Destructor.
      */
     ~TransliteratorParser();

     /**
      * Parse the given string as a sequence of rules, separated by newline
      * characters ('\n'), and cause this object to implement those rules.  Any
      * previous rules are discarded.  Typically this method is called exactly
      * once after construction.
      *
      * Parse the given rules, in the given direction.  After this call
      * returns, query the public data members for results.  The caller
      * owns the 'data' and 'compoundFilter' data members after this
      * call returns.
      */
     void parse(const UnicodeString& rules,
                UTransDirection direction,
                UParseError& pe,
                UErrorCode& ec);

     /**
      * Return the compound filter parsed by parse().  Caller owns result.
      */
     UnicodeSet* orphanCompoundFilter();

     /**
      * Return the data object parsed by parse().  Caller owns result.
      */
     TransliterationRuleData* orphanData();

 private:

     void parseRules(const UnicodeString& rules,
                     UTransDirection direction);

     /**
      * MAIN PARSER.  Parse the next rule in the given rule string, starting
      * at pos.  Return the index after the last character parsed.  Do not
      * parse characters at or after limit.
      *
      * Important:  The character at pos must be a non-whitespace character
      * that is not the comment character.
      *
      * This method handles quoting, escaping, and whitespace removal.  It
      * parses the end-of-rule character.  It recognizes context and cursor
      * indicators.  Once it does a lexical breakdown of the rule at pos, it
      * creates a rule object and adds it to our rule list.
      */
     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);

     /**
      * Set the variable range to [start, end] (inclusive).
      */
     void setVariableRange(int32_t start, int32_t end);

     /**
      * Set the maximum backup to 'backup', in response to a pragma
      * statement.
      */
     void pragmaMaximumBackup(int32_t backup);

     /**
      * Begin normalizing all rules using the given mode, in response
      * to a pragma statement.
      */
     void pragmaNormalizeRules(UNormalizationMode mode);

     /**
      * Return true if the given rule looks like a pragma.
      * @param pos offset to the first non-whitespace character
      * of the rule.
      * @param limit pointer past the last character of the rule.
      */
     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

     /**
      * Parse a pragma.  This method assumes resemblesPragma() has
      * already returned true.
      * @param pos offset to the first non-whitespace character
      * of the rule.
      * @param limit pointer past the last character of the rule.
      * @return the position index after the final ';' of the pragma,
      * or -1 on failure.
      */
     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);

     /**
      * Return true if the given string looks like valid output, that is,
      * does not contain quantifiers or other special input-only elements.
      */
     UBool isValidOutput(const UnicodeString& output) const;

     /**
      * Called by main parser upon syntax error.  Search the rule string
      * for the probable end of the rule.  Of course, if the error is that
      * the end of rule marker is missing, then the rule end will not be found.
      * In any case the rule start will be correctly reported.
      * @param msg error description
      * @param rule pattern string
      * @param start position of first character of current rule
      */
     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);

     /**
      * Parse a UnicodeSet out, store it, and return the stand-in character
      * used to represent it.
      */
     UChar parseSet(const UnicodeString& rule,
                    ParsePosition& pos);

     /**
      * Generate and return a stand-in for a new UnicodeMatcher.  Store
      * the matcher (adopt it).
      */
     UChar generateStandInFor(UnicodeMatcher* adopted);

     /**
      * Return the stand-in for the dot set.  It is allocated the first
      * time and reused thereafter.
      */
     UChar getDotStandIn();

     /**
      * Append the value of the given variable name to the given
      * UnicodeString.
      */
     void appendVariableDef(const UnicodeString& name,
                            UnicodeString& buf);

     /**
      * Return a stand-in character that refers to the given segments.
      * @param r a reference number >= 1
      * @return a stand-in for the given segment reference
      */
     UChar getSegmentStandin(int32_t r);

     /**
      * Returns the index of a character, ignoring quoted text.
      * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
      * found by a search for 'h'.
      * @param text text to be searched
      * @param start the beginning index, inclusive; <code>0 <= start
      * <= limit</code>.
      * @param limit the ending index, exclusive; <code>start <= limit
      * <= text.length()</code>.
      * @param c character to search for
      * @return Offset of the first instance of c, or -1 if not found.
      */
     static int32_t quotedIndexOf(const UnicodeString& text,
                                  int32_t start, int32_t limit,
                                  UChar c);

     //------------------------------------------------------------
     // Utility methods -- temporarily here
     //------------------------------------------------------------

     /**
      * Skip over a sequence of zero or more white space characters
      * at pos.  Return the index of the first non-white-space character
      * at or after pos, or str.length(), if there is none.
      */
     static int32_t skipWhitespace(const UnicodeString& str, int32_t pos);

     /**
      * Parse a pattern string starting at offset pos.  Keywords are
      * matched case-insensitively.  Spaces may be skipped and may be
      * optional or required.  Integer values may be parsed, and if
      * they are, they will be returned in the given array.  If
      * successful, the offset of the next non-space character is
      * returned.  On failure, -1 is returned.
      * @param pattern must only contain lowercase characters, which
      * will match their uppercase equivalents as well.  A space
      * character matches one or more required spaces.  A '~' character
      * matches zero or more optional spaces.  A '#' character matches
      * an integer and stores it in parsedInts, which the caller must
      * ensure has enough capacity.
      * @param parsedInts array to receive parsed integers.  Caller
      * must ensure that parsedInts.length is >= the number of '#'
      * signs in 'pattern'.
      * @return the position after the last character parsed, or -1 if
      * the parse failed
      */
     static int32_t parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
                                 const UnicodeString& pattern, int32_t* parsedInts);

     /**
      * Parse an integer at pos, either of the form \d+ or of the form
      * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
      * or octal format.
      * @param pos INPUT-OUTPUT parameter.  On input, the first
      * character to parse.  On output, the character after the last
      * parsed character.
      */
     static int32_t parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit);


     friend class RuleHalf;

     // Disallowed methods; no impl.
     TransliteratorParser(const TransliteratorParser&);
     TransliteratorParser& operator=(const TransliteratorParser&);
 };

 U_NAMESPACE_END

 #endif
	/*
	* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 11/17/99 aliu Creation.
	**********************************************************************
	*/
	#ifndef RBT_PARS_H
	#define RBT_PARS_H

	#include "unicode/rbt.h"
	#include "unicode/parseerr.h"
	#include "unicode/unorm.h"

	U_NAMESPACE_BEGIN

	class TransliterationRuleData;
	class UnicodeMatcher;
	class ParseData;
	class RuleHalf;
	class ParsePosition;
	class UVector;

	class TransliteratorParser {

	public:

	/**
	* PUBLIC data member containing the parsed data object, or null if
	* there were no rules.
	*/
	TransliterationRuleData* data;

	/**
	* PUBLIC data member.
	* The block of ::IDs, both at the top and at the bottom.
	* Inserted into these may be additional rules at the
	* idSplitPoint.
	*/
	UnicodeString idBlock;

	/**
	* PUBLIC data member.
	* In a compound RBT, the index at which the RBT rules are
	* inserted into the ID block. Index 0 means before any IDs
	* in the block. Index idBlock.length() means after all IDs
	* in the block. Index is a string index.
	*/
	int32_t idSplitPoint;

	/**
	* PUBLIC data member containing the parsed compound filter, if any.
	*/
	UnicodeSet* compoundFilter;

	private:

	// The number of rules parsed. This tells us if there were
	// any actual transliterator rules, or if there were just ::ID
	// block IDs.
	int32_t ruleCount;

	UTransDirection direction;

	/**
	* We use a single error code during parsing. Rather than pass it
	* through each API, we keep it here.
	*/
	UErrorCode status;

	/**
	* Parse error information.
	*/
	UParseError parseError;

	/**
	* Temporary symbol table used during parsing.
	*/
	ParseData* parseData;

	/**
	* Temporary vector of matcher variables. When parsing is complete, this
	* is copied into the array data.variables. As with data.variables,
	* element 0 corresponds to character data.variablesBase.
	*/
	UVector* variablesVector;

	/**
	* The next available stand-in for variables. This starts at some point in
	* the private use area (discovered dynamically) and increments up toward
	* <code>variableLimit</code>. At any point during parsing, available
	* variables are <code>variableNext..variableLimit-1</code>.
	*/
	UChar variableNext;

	/**
	* The last available stand-in for variables. This is discovered
	* dynamically. At any point during parsing, available variables are
	* <code>variableNext..variableLimit-1</code>.
	*/
	UChar variableLimit;

	/**
	* When we encounter an undefined variable, we do not immediately signal
	* an error, in case we are defining this variable, e.g., "$a = [a-z];".
	* Instead, we save the name of the undefined variable, and substitute
	* in the placeholder char variableLimit - 1, and decrement
	* variableLimit.
	*/
	UnicodeString undefinedVariableName;

	/**
	* The stand-in character for the 'dot' set, represented by '.' in
	* patterns. This is allocated the first time it is needed, and
	* reused thereafter.
	*/
	UChar dotStandIn;

	public:

	/**
	* Constructor.
	*/
	TransliteratorParser();

	/**
	* Destructor.
	*/
	~TransliteratorParser();

	/**
	* Parse the given string as a sequence of rules, separated by newline
	* characters ('\n'), and cause this object to implement those rules. Any
	* previous rules are discarded. Typically this method is called exactly
	* once after construction.
	*
	* Parse the given rules, in the given direction. After this call
	* returns, query the public data members for results. The caller
	* owns the 'data' and 'compoundFilter' data members after this
	* call returns.
	*/
	void parse(const UnicodeString& rules,
	UTransDirection direction,
	UParseError& pe,
	UErrorCode& ec);

	/**
	* Return the compound filter parsed by parse(). Caller owns result.
	*/
	UnicodeSet* orphanCompoundFilter();

	/**
	* Return the data object parsed by parse(). Caller owns result.
	*/
	TransliterationRuleData* orphanData();

	private:

	void parseRules(const UnicodeString& rules,
	UTransDirection direction);

	/**
	* MAIN PARSER. Parse the next rule in the given rule string, starting
	* at pos. Return the index after the last character parsed. Do not
	* parse characters at or after limit.
	*
	* Important: The character at pos must be a non-whitespace character
	* that is not the comment character.
	*
	* This method handles quoting, escaping, and whitespace removal. It
	* parses the end-of-rule character. It recognizes context and cursor
	* indicators. Once it does a lexical breakdown of the rule at pos, it
	* creates a rule object and adds it to our rule list.
	*/
	int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);

	/**
	* Set the variable range to [start, end] (inclusive).
	*/
	void setVariableRange(int32_t start, int32_t end);

	/**
	* Set the maximum backup to 'backup', in response to a pragma
	* statement.
	*/
	void pragmaMaximumBackup(int32_t backup);

	/**
	* Begin normalizing all rules using the given mode, in response
	* to a pragma statement.
	*/
	void pragmaNormalizeRules(UNormalizationMode mode);

	/**
	* Return true if the given rule looks like a pragma.
	* @param pos offset to the first non-whitespace character
	* of the rule.
	* @param limit pointer past the last character of the rule.
	*/
	static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

	/**
	* Parse a pragma. This method assumes resemblesPragma() has
	* already returned true.
	* @param pos offset to the first non-whitespace character
	* of the rule.
	* @param limit pointer past the last character of the rule.
	* @return the position index after the final ';' of the pragma,
	* or -1 on failure.
	*/
	int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);

	/**
	* Return true if the given string looks like valid output, that is,
	* does not contain quantifiers or other special input-only elements.
	*/
	UBool isValidOutput(const UnicodeString& output) const;

	/**
	* Called by main parser upon syntax error. Search the rule string
	* for the probable end of the rule. Of course, if the error is that
	* the end of rule marker is missing, then the rule end will not be found.
	* In any case the rule start will be correctly reported.
	* @param msg error description
	* @param rule pattern string
	* @param start position of first character of current rule
	*/
	int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);

	/**
	* Parse a UnicodeSet out, store it, and return the stand-in character
	* used to represent it.
	*/
	UChar parseSet(const UnicodeString& rule,
	ParsePosition& pos);

	/**
	* Generate and return a stand-in for a new UnicodeMatcher. Store
	* the matcher (adopt it).
	*/
	UChar generateStandInFor(UnicodeMatcher* adopted);

	/**
	* Return the stand-in for the dot set. It is allocated the first
	* time and reused thereafter.
	*/
	UChar getDotStandIn();

	/**
	* Append the value of the given variable name to the given
	* UnicodeString.
	*/
	void appendVariableDef(const UnicodeString& name,
	UnicodeString& buf);

	/**
	* Return a stand-in character that refers to the given segments.
	* @param r a reference number >= 1
	* @return a stand-in for the given segment reference
	*/
	UChar getSegmentStandin(int32_t r);

	/**
	* Returns the index of a character, ignoring quoted text.
	* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
	* found by a search for 'h'.
	* @param text text to be searched
	* @param start the beginning index, inclusive; <code>0 <= start
	* <= limit</code>.
	* @param limit the ending index, exclusive; <code>start <= limit
	* <= text.length()</code>.
	* @param c character to search for
	* @return Offset of the first instance of c, or -1 if not found.
	*/
	static int32_t quotedIndexOf(const UnicodeString& text,
	int32_t start, int32_t limit,
	UChar c);

	//------------------------------------------------------------
	// Utility methods -- temporarily here
	//------------------------------------------------------------

	/**
	* Skip over a sequence of zero or more white space characters
	* at pos. Return the index of the first non-white-space character
	* at or after pos, or str.length(), if there is none.
	*/
	static int32_t skipWhitespace(const UnicodeString& str, int32_t pos);

	/**
	* Parse a pattern string starting at offset pos. Keywords are
	* matched case-insensitively. Spaces may be skipped and may be
	* optional or required. Integer values may be parsed, and if
	* they are, they will be returned in the given array. If
	* successful, the offset of the next non-space character is
	* returned. On failure, -1 is returned.
	* @param pattern must only contain lowercase characters, which
	* will match their uppercase equivalents as well. A space
	* character matches one or more required spaces. A '~' character
	* matches zero or more optional spaces. A '#' character matches
	* an integer and stores it in parsedInts, which the caller must
	* ensure has enough capacity.
	* @param parsedInts array to receive parsed integers. Caller
	* must ensure that parsedInts.length is >= the number of '#'
	* signs in 'pattern'.
	* @return the position after the last character parsed, or -1 if
	* the parse failed
	*/
	static int32_t parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
	const UnicodeString& pattern, int32_t* parsedInts);

	/**
	* Parse an integer at pos, either of the form \d+ or of the form
	* 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
	* or octal format.
	* @param pos INPUT-OUTPUT parameter. On input, the first
	* character to parse. On output, the character after the last
	* parsed character.
	*/
	static int32_t parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit);


	friend class RuleHalf;

	// Disallowed methods; no impl.
	TransliteratorParser(const TransliteratorParser&);
	TransliteratorParser& operator=(const TransliteratorParser&);
	};

	U_NAMESPACE_END

	#endif