// icu4c/source/common/rbbirb.cpp - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 //
 //  file:  rbbirb.cpp
 //
 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
 //    building (compiling) break rules into the tables required by the runtime
 //    RBBI engine.
 //

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_BREAK_ITERATION

 #include "unicode/brkiter.h"
 #include "unicode/rbbi.h"
 #include "unicode/ubrk.h"
 #include "unicode/unistr.h"
 #include "unicode/uniset.h"
 #include "unicode/uchar.h"
 #include "unicode/uchriter.h"
 #include "unicode/ustring.h"
 #include "unicode/parsepos.h"
 #include "unicode/parseerr.h"

 #include "cmemory.h"
 #include "cstring.h"
 #include "rbbirb.h"
 #include "rbbinode.h"
 #include "rbbiscan.h"
 #include "rbbisetb.h"
 #include "rbbitblb.h"
 #include "rbbidata.h"
 #include "uassert.h"


 U_NAMESPACE_BEGIN


 //----------------------------------------------------------------------------------------
 //
 //  Constructor.
 //
 //      Initialises fRules and fStrippedRules from rules, records the caller's status
 //      and parse-error objects, and puts every other member into an empty state.
 //      If status is a success code on entry, the two work vectors, the rule scanner
 //      and the set builder are then allocated; an allocation that yields NULL is
 //      reported as U_MEMORY_ALLOCATION_ERROR.
 //
 //      rules     Source text of the break rules.
 //      parseErr  May be NULL; when supplied it is zero-filled here.
 //      status    Error code; its address is kept in fStatus.
 //
 //----------------------------------------------------------------------------------------
 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
                                        UParseError     *parseErr,
                                        UErrorCode      &status)
  : fRules(rules), fStrippedRules(rules)
 {
     // Objects supplied by the caller.
     fStatus     = &status;
     fParseError = parseErr;
     if (parseErr != nullptr) {
         uprv_memset(parseErr, 0, sizeof(UParseError));
     }

     // Debug switches are only read from the environment in RBBI_DEBUG builds.
 #ifdef RBBI_DEBUG
     fDebugEnv   = getenv("U_RBBIDEBUG");
 #else
     fDebugEnv   = nullptr;
 #endif

     // Parse trees; rules go to the forward tree unless the default is changed.
     fForwardTree        = nullptr;
     fReverseTree        = nullptr;
     fSafeFwdTree        = nullptr;
     fSafeRevTree        = nullptr;
     fDefaultTree        = &fForwardTree;

     // Option flags.
     fChainRules         = FALSE;
     fLBCMNoChain        = FALSE;
     fLookAheadHardBreak = FALSE;

     // Helper objects; they are created below only when status allows it.
     fForwardTable       = nullptr;
     fUSetNodes          = nullptr;
     fRuleStatusVals     = nullptr;
     fScanner            = nullptr;
     fSetBuilder         = nullptr;

     if (U_FAILURE(status)) {
         return;
     }

     // status is handed to the UVector constructors, which may report a failure through it.
     fUSetNodes          = new UVector(status);
     fRuleStatusVals     = new UVector(status);
     fScanner            = new RBBIRuleScanner(this);
     fSetBuilder         = new RBBISetBuilder(this);

     // An error already recorded in status takes precedence over the NULL check.
     const bool anyMissing = (fSetBuilder == nullptr) || (fScanner == nullptr) ||
                             (fUSetNodes == nullptr)  || (fRuleStatusVals == nullptr);
     if (U_SUCCESS(status) && anyMissing) {
         status = U_MEMORY_ALLOCATION_ERROR;
     }
 }


 //----------------------------------------------------------------------------------------
 //
 //  Destructor
 //
 //      Deletes the nodes held in fUSetNodes, then every helper object and parse tree
 //      owned by the builder.
 //
 //----------------------------------------------------------------------------------------
 RBBIRuleBuilder::~RBBIRuleBuilder() {

     // fUSetNodes is still NULL when the constructor returned early because status
     // had already failed, or when its allocation failed, so it must be checked
     // before it is dereferenced.
     if (fUSetNodes != nullptr) {
         // The loop has no bound of its own: it ends at the first NULL that
         // elementAt() returns.
         for (int32_t i = 0; ; i++) {
             RBBINode *n = static_cast<RBBINode *>(fUSetNodes->elementAt(i));
             if (n == nullptr) {
                 break;
             }
             delete n;
         }
     }

     delete fUSetNodes;
     delete fSetBuilder;
     delete fForwardTable;
     delete fForwardTree;
     delete fReverseTree;
     delete fSafeFwdTree;
     delete fSafeRevTree;
     delete fScanner;
     delete fRuleStatusVals;
 }


 //----------------------------------------------------------------------------------------
 //
 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
 //                    the format for saving in ICU data files,
 //                    which is also the format needed by the RBBI runtime engine.
 //
 //----------------------------------------------------------------------------------------
 // Rounds i up to the next multiple of 8; a value that already is a multiple of 8
 // comes back unchanged.
 static int32_t align8(int32_t i) {
     const int32_t bumped = i + 7;       // step past the next boundary unless already on one
     return bumped & 0xfffffff8;         // clear the low three bits
 }

 //
 //  Lays the compiled rule data out in a single block obtained from uprv_malloc():
 //  an RBBIDataHeader, then the forward state table, the safe reverse table, the
 //  trie, the rule status values and finally the stripped rule source as UTF-8.
 //
 //  Returns the new block, or NULL when *fStatus already indicates failure, when the
 //  allocation fails (U_MEMORY_ALLOCATION_ERROR) or when the UTF-8 conversion fails.
 //  No block is left allocated on a NULL return.
 //
 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
     if (U_FAILURE(*fStatus)) {
         return nullptr;
     }

     // Remove whitespace from the rules to make it smaller.
     // The rule parser has already removed comments.
     fStrippedRules = fScanner->stripRules(fStrippedRules);

     // Calculate the size of each section in the data.
     //   Every size is rounded up to a multiple of 8, so each section starts at an
     //   offset that is a multiple of 8 from the start of the block.
     //   The header fields fFTableLen, fRTableLen, fTrieLen and fStatusTableLen
     //   receive these padded sizes; only fRuleSourceLen holds an unpadded length.
     //
     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
     int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
     int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));

     // Preflight the UTF-8 conversion: no destination buffer is supplied, so the call
     // is used only to obtain the required length. A buffer-overflow status is the
     // expected result of that and is cleared; any other failure is genuine and ends
     // the build here, before anything has been allocated.
     int32_t rulesLengthInUTF8 = 0;
     u_strToUTF8WithSub(nullptr, 0, &rulesLengthInUTF8,
                        fStrippedRules.getBuffer(), fStrippedRules.length(),
                        0xfffd, nullptr, fStatus);
     if (U_FAILURE(*fStatus) && *fStatus != U_BUFFER_OVERFLOW_ERROR) {
         return nullptr;
     }
     *fStatus = U_ZERO_ERROR;

     // One byte more than the converted length (presumably for a terminating NUL),
     // then padded like the other sections.
     int32_t rulesSize         = align8((rulesLengthInUTF8+1));

     int32_t         totalSize = headerSize
                                 + forwardTableSize
                                 + reverseTableSize
                                 + statusTableSize + trieSize + rulesSize;

 #ifdef RBBI_DEBUG
     if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
         RBBIDebugPrintf("Header Size:        %8d\n", headerSize);
         RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
         RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
         RBBIDebugPrintf("Trie Size:          %8d\n", trieSize);
         RBBIDebugPrintf("Status Table Size:  %8d\n", statusTableSize);
         RBBIDebugPrintf("Rules Size:         %8d\n", rulesSize);
         RBBIDebugPrintf("-----------------------------\n");
         RBBIDebugPrintf("Total Size:         %8d\n", totalSize);
     }
 #endif

     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
     if (data == nullptr) {
         *fStatus = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     // Zero the whole block so that padding bytes and unset fields are zero.
     uprv_memset(data, 0, totalSize);
     uint8_t *base = reinterpret_cast<uint8_t *>(data);

     data->fMagic            = 0xb1a0;
     data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
     data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
     data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
     data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
     data->fLength           = totalSize;
     data->fCatCount         = fSetBuilder->getNumCharCategories();

     // Section offsets are byte offsets from the start of the block; each section
     // begins where the previous one ends.
     data->fFTable        = headerSize;
     data->fFTableLen     = forwardTableSize;

     data->fRTable        = data->fFTable  + data->fFTableLen;
     data->fRTableLen     = reverseTableSize;

     data->fTrie          = data->fRTable + data->fRTableLen;
     data->fTrieLen       = trieSize;
     data->fStatusTable   = data->fTrie    + data->fTrieLen;
     data->fStatusTableLen= statusTableSize;
     data->fRuleSource    = data->fStatusTable + statusTableSize;
     data->fRuleSourceLen = rulesLengthInUTF8;

     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));

     fForwardTable->exportTable(base + data->fFTable);
     fForwardTable->exportSafeTable(base + data->fRTable);
     fSetBuilder->serializeTrie (base + data->fTrie);

     int32_t *ruleStatusTable = reinterpret_cast<int32_t *>(base + data->fStatusTable);
     for (int32_t i = 0; i < fRuleStatusVals->size(); i++) {
         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
     }

     u_strToUTF8WithSub(reinterpret_cast<char *>(base + data->fRuleSource), rulesSize,
                        &rulesLengthInUTF8,
                        fStrippedRules.getBuffer(), fStrippedRules.length(),
                        0xfffd, nullptr, fStatus);
     if (U_FAILURE(*fStatus)) {
         // Nothing else refers to the block yet; release it rather than leak it.
         uprv_free(data);
         return nullptr;
     }

     return data;
 }


 //----------------------------------------------------------------------------------------
 //
 //  createRuleBasedBreakIterator    construct from source rules that are passed in
 //                                  in a UnicodeString
 //
 //      rules       Source text of the break rules.
 //      parseError  Forwarded to the RBBIRuleBuilder constructor.
 //      status      Error code; checked after every step.
 //
 //      Returns the new iterator, or NULL with status set to a failure code.
 //
 //----------------------------------------------------------------------------------------
 BreakIterator *
 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
                                     UParseError      *parseError,
                                     UErrorCode       &status)
 {
     //
     // Read the input rules, generate a parse tree, symbol table,
     // and list of all Unicode Sets referenced by the rules.
     //
     RBBIRuleBuilder  builder(rules, parseError, status);
     if (U_FAILURE(status)) { // build() would also refuse to run; stop here.
         return nullptr;
     }

     RBBIDataHeader *data = builder.build(status);
     if (U_FAILURE(status)) {
         return nullptr;
     }

     //
     //  Create a break iterator from the compiled rules.
     //     (Identical to creation from stored pre-compiled rules)
     //
     // NOTE(review): the iterator is assumed to take ownership of data once it has
     // been constructed, since nothing here frees it afterwards - confirm.
     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
     if (This == nullptr) {
         // No iterator object exists, so nothing has taken the data block over;
         // release it here instead of leaking it.
         uprv_free(data);
         if (U_SUCCESS(status)) {
             status = U_MEMORY_ALLOCATION_ERROR;
         }
         return nullptr;
     }
     if (U_FAILURE(status)) {
         // The constructor reported an error through status.
         delete This;
         return nullptr;
     }
     return This;
 }

 //
 //  build()   Runs the whole rule compilation: parse the rules, build the character
 //            ranges, build and optimize the forward state table, build the safe
 //            reverse table and the trie, then flatten everything into one block.
 //
 //      status  Error code. It is checked after every stage so that a later stage
 //              never runs on what a failed stage left behind. flattenData() reports
 //              its failures through *fStatus, so this is expected to be the same
 //              UErrorCode that was given to the constructor.
 //
 //      Returns the flattened data, or NULL when any stage fails.
 //
 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     fScanner->parse();
     if (U_FAILURE(status)) {
         return nullptr;
     }

     //
     // UnicodeSet processing.
     //    Munge the Unicode Sets to create an initial set of character categories.
     //
     fSetBuilder->buildRanges();
     if (U_FAILURE(status)) {
         return nullptr;
     }

     //
     //   Generate the DFA state transition table.
     //
     fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
     if (fForwardTable == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     if (U_FAILURE(status)) {
         // The table builder's constructor was handed status and reported a failure.
         return nullptr;
     }

     fForwardTable->buildForwardTable();
     if (U_FAILURE(status)) {
         return nullptr;
     }

     // State table and character category optimization.
     // Merge equivalent rows and columns.
     // Note that this process alters the initial set of character categories,
     // causing the representation of UnicodeSets in the parse tree to become invalid.

     optimizeTables();
     fForwardTable->buildSafeReverseTable(status);
     if (U_FAILURE(status)) {
         return nullptr;
     }

 #ifdef RBBI_DEBUG
     if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
         fForwardTable->printStates();
         fForwardTable->printRuleStatusTable();
         fForwardTable->printReverseTable();
     }
 #endif

     //    Generate the mapping tables (TRIE) from input code points to
     //    the character categories.
     //
     fSetBuilder->buildTrie();
     if (U_FAILURE(status)) {
         return nullptr;
     }

     //
     //   Package up the compiled data into a memory image
     //      in the run-time format.
     //
     RBBIDataHeader *data = flattenData(); // returns NULL if error
     if (U_FAILURE(status)) {
         // Do not hand back, or leak, an image when status reports a failure.
         uprv_free(data);
         return nullptr;
     }
     return data;
 }

 void RBBIRuleBuilder::optimizeTables() {
     bool didSomething;
     do {
         didSomething = false;

         // Begin looking for duplicates with char class 3.
         // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
         // and should not have other categories merged into them.
         IntPair duplPair = {3, 0};
         while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
             fSetBuilder->mergeCategories(duplPair);
             fForwardTable->removeColumn(duplPair.second);
             didSomething = true;
         }

         while (fForwardTable->removeDuplicateStates() > 0) {
             didSomething = true;
         }
     } while (didSomething);
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	//
	// file: rbbirb.cpp
	//
	// Copyright (C) 2002-2011, International Business Machines Corporation and others.
	// All Rights Reserved.
	//
	// This file contains the RBBIRuleBuilder class implementation. This is the main class for
	// building (compiling) break rules into the tables required by the runtime
	// RBBI engine.
	//

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_BREAK_ITERATION

	#include "unicode/brkiter.h"
	#include "unicode/rbbi.h"
	#include "unicode/ubrk.h"
	#include "unicode/unistr.h"
	#include "unicode/uniset.h"
	#include "unicode/uchar.h"
	#include "unicode/uchriter.h"
	#include "unicode/ustring.h"
	#include "unicode/parsepos.h"
	#include "unicode/parseerr.h"

	#include "cmemory.h"
	#include "cstring.h"
	#include "rbbirb.h"
	#include "rbbinode.h"
	#include "rbbiscan.h"
	#include "rbbisetb.h"
	#include "rbbitblb.h"
	#include "rbbidata.h"
	#include "uassert.h"


	U_NAMESPACE_BEGIN


	//----------------------------------------------------------------------------------------
	//
	//  Constructor.
	//
	//      Initialises fRules and fStrippedRules from rules, records the caller's status
	//      and parse-error objects, and puts every other member into an empty state.
	//      If status is a success code on entry, the two work vectors, the rule scanner
	//      and the set builder are then allocated; an allocation that yields NULL is
	//      reported as U_MEMORY_ALLOCATION_ERROR.
	//
	//      rules     Source text of the break rules.
	//      parseErr  May be NULL; when supplied it is zero-filled here.
	//      status    Error code; its address is kept in fStatus.
	//
	//----------------------------------------------------------------------------------------
	RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
	                                       UParseError     *parseErr,
	                                       UErrorCode      &status)
	 : fRules(rules), fStrippedRules(rules)
	{
	    // Objects supplied by the caller.
	    fStatus     = &status;
	    fParseError = parseErr;
	    if (parseErr != nullptr) {
	        uprv_memset(parseErr, 0, sizeof(UParseError));
	    }

	    // Debug switches are only read from the environment in RBBI_DEBUG builds.
	#ifdef RBBI_DEBUG
	    fDebugEnv   = getenv("U_RBBIDEBUG");
	#else
	    fDebugEnv   = nullptr;
	#endif

	    // Parse trees; rules go to the forward tree unless the default is changed.
	    fForwardTree        = nullptr;
	    fReverseTree        = nullptr;
	    fSafeFwdTree        = nullptr;
	    fSafeRevTree        = nullptr;
	    fDefaultTree        = &fForwardTree;

	    // Option flags.
	    fChainRules         = FALSE;
	    fLBCMNoChain        = FALSE;
	    fLookAheadHardBreak = FALSE;

	    // Helper objects; they are created below only when status allows it.
	    fForwardTable       = nullptr;
	    fUSetNodes          = nullptr;
	    fRuleStatusVals     = nullptr;
	    fScanner            = nullptr;
	    fSetBuilder         = nullptr;

	    if (U_FAILURE(status)) {
	        return;
	    }

	    // status is handed to the UVector constructors, which may report a failure through it.
	    fUSetNodes          = new UVector(status);
	    fRuleStatusVals     = new UVector(status);
	    fScanner            = new RBBIRuleScanner(this);
	    fSetBuilder         = new RBBISetBuilder(this);

	    // An error already recorded in status takes precedence over the NULL check.
	    const bool anyMissing = (fSetBuilder == nullptr) || (fScanner == nullptr) ||
	                            (fUSetNodes == nullptr)  || (fRuleStatusVals == nullptr);
	    if (U_SUCCESS(status) && anyMissing) {
	        status = U_MEMORY_ALLOCATION_ERROR;
	    }
	}



	//----------------------------------------------------------------------------------------
	//
	//  Destructor
	//
	//      Deletes the nodes held in fUSetNodes, then every helper object and parse tree
	//      owned by the builder.
	//
	//----------------------------------------------------------------------------------------
	RBBIRuleBuilder::~RBBIRuleBuilder() {

	    // fUSetNodes is still NULL when the constructor returned early because status
	    // had already failed, or when its allocation failed, so it must be checked
	    // before it is dereferenced.
	    if (fUSetNodes != nullptr) {
	        // The loop has no bound of its own: it ends at the first NULL that
	        // elementAt() returns.
	        for (int32_t i = 0; ; i++) {
	            RBBINode *n = static_cast<RBBINode *>(fUSetNodes->elementAt(i));
	            if (n == nullptr) {
	                break;
	            }
	            delete n;
	        }
	    }

	    delete fUSetNodes;
	    delete fSetBuilder;
	    delete fForwardTable;
	    delete fForwardTree;
	    delete fReverseTree;
	    delete fSafeFwdTree;
	    delete fSafeRevTree;
	    delete fScanner;
	    delete fRuleStatusVals;
	}





	//----------------------------------------------------------------------------------------
	//
	// flattenData() - Collect up the compiled RBBI rule data and put it into
	// the format for saving in ICU data files,
	// which is also the format needed by the RBBI runtime engine.
	//
	//----------------------------------------------------------------------------------------
	// Rounds i up to the next multiple of 8; a value that already is a multiple of 8
	// comes back unchanged.
	static int32_t align8(int32_t i) {
	    const int32_t bumped = i + 7;       // step past the next boundary unless already on one
	    return bumped & 0xfffffff8;         // clear the low three bits
	}

	//
	//  Lays the compiled rule data out in a single block obtained from uprv_malloc():
	//  an RBBIDataHeader, then the forward state table, the safe reverse table, the
	//  trie, the rule status values and finally the stripped rule source as UTF-8.
	//
	//  Returns the new block, or NULL when *fStatus already indicates failure, when the
	//  allocation fails (U_MEMORY_ALLOCATION_ERROR) or when the UTF-8 conversion fails.
	//  No block is left allocated on a NULL return.
	//
	RBBIDataHeader *RBBIRuleBuilder::flattenData() {
	    if (U_FAILURE(*fStatus)) {
	        return nullptr;
	    }

	    // Remove whitespace from the rules to make it smaller.
	    // The rule parser has already removed comments.
	    fStrippedRules = fScanner->stripRules(fStrippedRules);

	    // Calculate the size of each section in the data.
	    //   Every size is rounded up to a multiple of 8, so each section starts at an
	    //   offset that is a multiple of 8 from the start of the block.
	    //   The header fields fFTableLen, fRTableLen, fTrieLen and fStatusTableLen
	    //   receive these padded sizes; only fRuleSourceLen holds an unpadded length.
	    //
	    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
	    int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
	    int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
	    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
	    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));

	    // Preflight the UTF-8 conversion: no destination buffer is supplied, so the call
	    // is used only to obtain the required length. A buffer-overflow status is the
	    // expected result of that and is cleared; any other failure is genuine and ends
	    // the build here, before anything has been allocated.
	    int32_t rulesLengthInUTF8 = 0;
	    u_strToUTF8WithSub(nullptr, 0, &rulesLengthInUTF8,
	                       fStrippedRules.getBuffer(), fStrippedRules.length(),
	                       0xfffd, nullptr, fStatus);
	    if (U_FAILURE(*fStatus) && *fStatus != U_BUFFER_OVERFLOW_ERROR) {
	        return nullptr;
	    }
	    *fStatus = U_ZERO_ERROR;

	    // One byte more than the converted length (presumably for a terminating NUL),
	    // then padded like the other sections.
	    int32_t rulesSize         = align8((rulesLengthInUTF8+1));

	    int32_t         totalSize = headerSize
	                                + forwardTableSize
	                                + reverseTableSize
	                                + statusTableSize + trieSize + rulesSize;

	#ifdef RBBI_DEBUG
	    if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
	        RBBIDebugPrintf("Header Size: %8d\n", headerSize);
	        RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
	        RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
	        RBBIDebugPrintf("Trie Size: %8d\n", trieSize);
	        RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize);
	        RBBIDebugPrintf("Rules Size: %8d\n", rulesSize);
	        RBBIDebugPrintf("-----------------------------\n");
	        RBBIDebugPrintf("Total Size: %8d\n", totalSize);
	    }
	#endif

	    RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
	    if (data == nullptr) {
	        *fStatus = U_MEMORY_ALLOCATION_ERROR;
	        return nullptr;
	    }
	    // Zero the whole block so that padding bytes and unset fields are zero.
	    uprv_memset(data, 0, totalSize);
	    uint8_t *base = reinterpret_cast<uint8_t *>(data);

	    data->fMagic            = 0xb1a0;
	    data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
	    data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
	    data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
	    data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
	    data->fLength           = totalSize;
	    data->fCatCount         = fSetBuilder->getNumCharCategories();

	    // Section offsets are byte offsets from the start of the block; each section
	    // begins where the previous one ends.
	    data->fFTable        = headerSize;
	    data->fFTableLen     = forwardTableSize;

	    data->fRTable        = data->fFTable  + data->fFTableLen;
	    data->fRTableLen     = reverseTableSize;

	    data->fTrie          = data->fRTable + data->fRTableLen;
	    data->fTrieLen       = trieSize;
	    data->fStatusTable   = data->fTrie    + data->fTrieLen;
	    data->fStatusTableLen= statusTableSize;
	    data->fRuleSource    = data->fStatusTable + statusTableSize;
	    data->fRuleSourceLen = rulesLengthInUTF8;

	    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));

	    fForwardTable->exportTable(base + data->fFTable);
	    fForwardTable->exportSafeTable(base + data->fRTable);
	    fSetBuilder->serializeTrie (base + data->fTrie);

	    int32_t *ruleStatusTable = reinterpret_cast<int32_t *>(base + data->fStatusTable);
	    for (int32_t i = 0; i < fRuleStatusVals->size(); i++) {
	        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
	    }

	    u_strToUTF8WithSub(reinterpret_cast<char *>(base + data->fRuleSource), rulesSize,
	                       &rulesLengthInUTF8,
	                       fStrippedRules.getBuffer(), fStrippedRules.length(),
	                       0xfffd, nullptr, fStatus);
	    if (U_FAILURE(*fStatus)) {
	        // Nothing else refers to the block yet; release it rather than leak it.
	        uprv_free(data);
	        return nullptr;
	    }

	    return data;
	}


	//----------------------------------------------------------------------------------------
	//
	//  createRuleBasedBreakIterator    construct from source rules that are passed in
	//                                  in a UnicodeString
	//
	//      rules       Source text of the break rules.
	//      parseError  Forwarded to the RBBIRuleBuilder constructor.
	//      status      Error code; checked after every step.
	//
	//      Returns the new iterator, or NULL with status set to a failure code.
	//
	//----------------------------------------------------------------------------------------
	BreakIterator *
	RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
	                                    UParseError      *parseError,
	                                    UErrorCode       &status)
	{
	    //
	    // Read the input rules, generate a parse tree, symbol table,
	    // and list of all Unicode Sets referenced by the rules.
	    //
	    RBBIRuleBuilder  builder(rules, parseError, status);
	    if (U_FAILURE(status)) { // build() would also refuse to run; stop here.
	        return nullptr;
	    }

	    RBBIDataHeader *data = builder.build(status);
	    if (U_FAILURE(status)) {
	        return nullptr;
	    }

	    //
	    //  Create a break iterator from the compiled rules.
	    //     (Identical to creation from stored pre-compiled rules)
	    //
	    // NOTE(review): the iterator is assumed to take ownership of data once it has
	    // been constructed, since nothing here frees it afterwards - confirm.
	    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
	    if (This == nullptr) {
	        // No iterator object exists, so nothing has taken the data block over;
	        // release it here instead of leaking it.
	        uprv_free(data);
	        if (U_SUCCESS(status)) {
	            status = U_MEMORY_ALLOCATION_ERROR;
	        }
	        return nullptr;
	    }
	    if (U_FAILURE(status)) {
	        // The constructor reported an error through status.
	        delete This;
	        return nullptr;
	    }
	    return This;
	}

	//
	//  build()   Runs the whole rule compilation: parse the rules, build the character
	//            ranges, build and optimize the forward state table, build the safe
	//            reverse table and the trie, then flatten everything into one block.
	//
	//      status  Error code. It is checked after every stage so that a later stage
	//              never runs on what a failed stage left behind. flattenData() reports
	//              its failures through *fStatus, so this is expected to be the same
	//              UErrorCode that was given to the constructor.
	//
	//      Returns the flattened data, or NULL when any stage fails.
	//
	RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
	    if (U_FAILURE(status)) {
	        return nullptr;
	    }

	    fScanner->parse();
	    if (U_FAILURE(status)) {
	        return nullptr;
	    }

	    //
	    // UnicodeSet processing.
	    //    Munge the Unicode Sets to create an initial set of character categories.
	    //
	    fSetBuilder->buildRanges();
	    if (U_FAILURE(status)) {
	        return nullptr;
	    }

	    //
	    //   Generate the DFA state transition table.
	    //
	    fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
	    if (fForwardTable == nullptr) {
	        status = U_MEMORY_ALLOCATION_ERROR;
	        return nullptr;
	    }
	    if (U_FAILURE(status)) {
	        // The table builder's constructor was handed status and reported a failure.
	        return nullptr;
	    }

	    fForwardTable->buildForwardTable();
	    if (U_FAILURE(status)) {
	        return nullptr;
	    }

	    // State table and character category optimization.
	    // Merge equivalent rows and columns.
	    // Note that this process alters the initial set of character categories,
	    // causing the representation of UnicodeSets in the parse tree to become invalid.

	    optimizeTables();
	    fForwardTable->buildSafeReverseTable(status);
	    if (U_FAILURE(status)) {
	        return nullptr;
	    }

	#ifdef RBBI_DEBUG
	    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
	        fForwardTable->printStates();
	        fForwardTable->printRuleStatusTable();
	        fForwardTable->printReverseTable();
	    }
	#endif

	    //    Generate the mapping tables (TRIE) from input code points to
	    //    the character categories.
	    //
	    fSetBuilder->buildTrie();
	    if (U_FAILURE(status)) {
	        return nullptr;
	    }

	    //
	    //   Package up the compiled data into a memory image
	    //      in the run-time format.
	    //
	    RBBIDataHeader *data = flattenData(); // returns NULL if error
	    if (U_FAILURE(status)) {
	        // Do not hand back, or leak, an image when status reports a failure.
	        uprv_free(data);
	        return nullptr;
	    }
	    return data;
	}

	void RBBIRuleBuilder::optimizeTables() {
	bool didSomething;
	do {
	didSomething = false;

	// Begin looking for duplicates with char class 3.
	// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
	// and should not have other categories merged into them.
	IntPair duplPair = {3, 0};
	while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
	fSetBuilder->mergeCategories(duplPair);
	fForwardTable->removeColumn(duplPair.second);
	didSomething = true;
	}

	while (fForwardTable->removeDuplicateStates() > 0) {
	didSomething = true;
	}
	} while (didSomething);
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */