// source/common/rbbiscan.h - external/github.com/unicode-org/icu - Git at Google

 // Copyright (C) 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 //
 //  rbbiscan.h
 //
 //  Copyright (C) 2002-2016, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains declarations for class RBBIRuleScanner
 //


 #ifndef RBBISCAN_H
 #define RBBISCAN_H

 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
 #include "unicode/rbbi.h"
 #include "unicode/uniset.h"
 #include "unicode/parseerr.h"
 #include "uhash.h"
 #include "uvector.h"
 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
                           //    looks up references to $variables within a set.
 #include "rbbinode.h"
 #include "rbbirpt.h"

 U_NAMESPACE_BEGIN

 // Forward declarations.  This header only refers to these two types through
 // pointers (RBBIRuleBuilder* and RBBISymbolTable*), so their full definitions
 // are not required here.
 class   RBBIRuleBuilder;
 class   RBBISymbolTable;


 //--------------------------------------------------------------------------------
 //
 //  class RBBIRuleScanner does the lowest level, character-at-a-time
 //                        scanning of break iterator rules.
 //
 //                        The output of the scanner is parse trees for
 //                        the rule expressions and a list of all Unicode Sets
 //                        encountered.
 //
 //--------------------------------------------------------------------------------

 class RBBIRuleScanner : public UMemory {
 public:

     enum {
         kStackSize = 100            // The size of the state stack for
     };                              //   rules parsing.  Corresponds roughly
                                     //   to the depth of parentheses nesting
                                     //   that is allowed in the rules.
                                     //   Also used as the capacity of the node
                                     //   stack (fNodeStack) declared below.

     // One character handed to the parse state machine, plus a flag.
     // A default-constructed RBBIRuleChar has fChar == 0 and fEscaped == FALSE.
     // NOTE(review): fEscaped presumably marks a character that came from an
     //               escaped or quoted context in the rule text - confirm
     //               against the implementation of nextChar().
     struct RBBIRuleChar {
         UChar32             fChar;
         UBool               fEscaped;
         RBBIRuleChar() : fChar(0), fEscaped(FALSE) {};
     };

     // Construct a scanner for the given rule builder (see fRB below).
     RBBIRuleScanner(RBBIRuleBuilder  *rb);


     virtual    ~RBBIRuleScanner();

     void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream
                                                     //   and store it in c.
                                                     //   This function returns void, so end of
                                                     //   input is not reported through a return
                                                     //   value.
                                                     //   NOTE(review): presumably end of input is
                                                     //   signalled through c - confirm against
                                                     //   the implementation.

     UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
                                                     //   Only a single character may be pushed.

     void        parse();                            // Parse the rules, generating two parse
                                                     //   trees, one each for the forward and
                                                     //   reverse rules,
                                                     //   and a list of UnicodeSets encountered.

     /**
      * Return a rules string without unnecessary
      * characters.
      */
     static UnicodeString stripRules(const UnicodeString &rules);
 private:

     UBool       doParseActions(int32_t a);
     void        error(UErrorCode e);                   // error reporting convenience function.
     void        fixOpStack(RBBINode::OpPrecedence p);
                                                        // Takes an operator precedence.
                                                        //   NOTE(review): the original comment for
                                                        //   this function was truncated; see the
                                                        //   implementation for the exact behavior.
     void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
                                                        // setToAdopt is optional (defaults to NULL).
                                                        //   NOTE(review): the name suggests ownership
                                                        //   of a non-NULL set is taken - confirm.

     UChar32     nextCharLL();
 #ifdef RBBI_DEBUG
     void        printNodeStack(const char *title);  // Only declared when RBBI_DEBUG is defined.
 #endif
     RBBINode    *pushNewNode(RBBINode::NodeType  t);
     void        scanSet();


     RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.

     int32_t                       fScanIndex;        // Index of current character being processed
                                                      //   in the rule input string.
     int32_t                       fNextIndex;        // Index of the next character, which
                                                      //   is the first character not yet scanned.
     UBool                         fQuoteMode;        // Scan is in a 'quoted region'
     int32_t                       fLineNum;          // Line number in input file.
     int32_t                       fCharNum;          // Char position within the line.
     UChar32                       fLastChar;         // Previous char, needed to count CR-LF
                                                      //   as a single line, not two.

     RBBIRuleChar                  fC;                // Current char for parse state machine
                                                      //   processing.
     UnicodeString                 fVarName;          // $variableName, valid when we've just
                                                      //   scanned one.

     RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
                                                      //   parsing.  index by p[state][char-class]

     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
     int32_t                       fStackPtr;           //  and pops as specified in the state
                                                        //  transition rules.

     RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
                                                            //  during the parse of a rule
     int32_t                        fNodeStackPtr;          // Position within fNodeStack.
                                                            //  NOTE(review): presumably the index of
                                                            //  the top entry - confirm.


     UBool                          fReverseRule;     // True if the rule currently being scanned
                                                      //  is a reverse direction rule (if it
                                                      //  starts with a '!')

     UBool                          fLookAheadRule;   // True if the rule includes a '/'
                                                      //   somewhere within it.

     UBool                          fNoChainInRule;   // True if the current rule starts with a '^'.

     RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
                                                      //   $variable symbols.

     UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to
                                                      //   the sets created while parsing rules.
                                                      //   The key is the string used for creating
                                                      //   the set.

     UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
                                                      //  the scanning of RBBI rules.  The
                                                      //  indices for these are assigned by the
                                                      //  perl script that builds the state tables.
                                                      //  See rbbirpt.h.

     int32_t                        fRuleNum;         // Counts each rule as it is scanned.

     int32_t                        fOptionStart;     // Input index of start of a !!option
                                                      //   keyword, while being scanned.

     // Pointers to UnicodeSets used for classifying rule characters.
     // Despite the 'g' prefix these are per-instance data members, not globals.
     // NOTE(review): what they point at and who owns it is not visible in this
     //               header - confirm against the constructor and destructor.
     UnicodeSet *gRuleSet_rule_char;
     UnicodeSet *gRuleSet_white_space;
     UnicodeSet *gRuleSet_name_char;
     UnicodeSet *gRuleSet_name_start_char;

     RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
     RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
 };

 U_NAMESPACE_END

 #endif
	// Copyright (C) 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	//
	// rbbiscan.h
	//
	// Copyright (C) 2002-2016, International Business Machines Corporation and others.
	// All Rights Reserved.
	//
	// This file contains declarations for class RBBIRuleScanner
	//


	#ifndef RBBISCAN_H
	#define RBBISCAN_H

	#include "unicode/utypes.h"
	#include "unicode/uobject.h"
	#include "unicode/rbbi.h"
	#include "unicode/uniset.h"
	#include "unicode/parseerr.h"
	#include "uhash.h"
	#include "uvector.h"
	#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
	// looks up references to $variables within a set.
	#include "rbbinode.h"
	#include "rbbirpt.h"

	U_NAMESPACE_BEGIN

	// Forward declarations.  This header only refers to these two types through
	// pointers (RBBIRuleBuilder* and RBBISymbolTable*), so their full definitions
	// are not required here.
	class RBBIRuleBuilder;
	class RBBISymbolTable;


	//--------------------------------------------------------------------------------
	//
	// class RBBIRuleScanner does the lowest level, character-at-a-time
	// scanning of break iterator rules.
	//
	// The output of the scanner is parse trees for
	// the rule expressions and a list of all Unicode Sets
	// encountered.
	//
	//--------------------------------------------------------------------------------

	class RBBIRuleScanner : public UMemory {
	public:

	enum {
	kStackSize = 100 // The size of the state stack for
	}; // rules parsing. Corresponds roughly
	// to the depth of parentheses nesting
	// that is allowed in the rules.

	struct RBBIRuleChar {
	UChar32 fChar;
	UBool fEscaped;
	RBBIRuleChar() : fChar(0), fEscaped(FALSE) {};
	};

	RBBIRuleScanner(RBBIRuleBuilder *rb);


	virtual ~RBBIRuleScanner();

	void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
	// Return false if at end.

	UBool push(const RBBIRuleChar &c); // Push (unget) one character.
	// Only a single character may be pushed.

	void parse(); // Parse the rules, generating two parse
	// trees, one each for the forward and
	// reverse rules,
	// and a list of UnicodeSets encountered.

	/**
	* Return a rules string without unnecessary
	* characters.
	*/
	static UnicodeString stripRules(const UnicodeString &rules);
	private:

	UBool doParseActions(int32_t a);
	void error(UErrorCode e); // error reporting convenience function.
	void fixOpStack(RBBINode::OpPrecedence p);
	// a character.
	void findSetFor(const UnicodeString &s, RBBINode node, UnicodeSet setToAdopt = NULL);

	UChar32 nextCharLL();
	#ifdef RBBI_DEBUG
	void printNodeStack(const char *title);
	#endif
	RBBINode *pushNewNode(RBBINode::NodeType t);
	void scanSet();


	RBBIRuleBuilder *fRB; // The rule builder that we are part of.

	int32_t fScanIndex; // Index of current character being processed
	// in the rule input string.
	int32_t fNextIndex; // Index of the next character, which
	// is the first character not yet scanned.
	UBool fQuoteMode; // Scan is in a 'quoted region'
	int32_t fLineNum; // Line number in input file.
	int32_t fCharNum; // Char position within the line.
	UChar32 fLastChar; // Previous char, needed to count CR-LF
	// as a single line, not two.

	RBBIRuleChar fC; // Current char for parse state machine
	// processing.
	UnicodeString fVarName; // $variableName, valid when we've just
	// scanned one.

	RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
	// parsing. index by p[state][char-class]

	uint16_t fStack[kStackSize]; // State stack, holds state pushes
	int32_t fStackPtr; // and pops as specified in the state
	// transition rules.

	RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
	// during the parse of a rule
	int32_t fNodeStackPtr;


	UBool fReverseRule; // True if the rule currently being scanned
	// is a reverse direction rule (if it
	// starts with a '!')

	UBool fLookAheadRule; // True if the rule includes a '/'
	// somewhere within it.

	UBool fNoChainInRule; // True if the current rule starts with a '^'.

	RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
	// $variable symbols.

	UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to
	// the sets created while parsing rules.
	// The key is the string used for creating
	// the set.

	UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
	// the scanning of RBBI rules. The
	// indicies for these are assigned by the
	// perl script that builds the state tables.
	// See rbbirpt.h.

	int32_t fRuleNum; // Counts each rule as it is scanned.

	int32_t fOptionStart; // Input index of start of a !!option
	// keyword, while being scanned.

	UnicodeSet *gRuleSet_rule_char;
	UnicodeSet *gRuleSet_white_space;
	UnicodeSet *gRuleSet_name_char;
	UnicodeSet *gRuleSet_name_start_char;

	RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
	RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
	};

	U_NAMESPACE_END

	#endif