// blob: 6c0f60b4dc71a3dfd939730d236fc8e344621c56 [file] [log] [blame]
/******************************************************************************
* Copyright © {1996-1999}, International Business Machines Corporation and others. All Rights Reserved.
******************************************************************************
*/
//=============================================================================
//
// File mergecol.h
//
//
//
// Created by: Helena Shih
//
// Modification History:
//
// Date Name Description
// 3/5/97 mark Cleaned up fixEntry(). Added constants BYTEPOWER
// and BYTEMASK to replace BYTESIZE.
// 6/17/97 helena In getPattern, added the queue-up list for entries
// with the same extension chars.
// 8/18/97 helena Added internal API documentation.
// 8/13/98 erm Synched up with 1.2 version of MergeCollation.java
// 04/23/99 stephen Removed EDecompositionMode, merged with
// Normalizer::EMode
//=============================================================================
#ifndef MERGECOL_H
#define MERGECOL_H
#include "unicode/unistr.h"
#include "ptnentry.h"
#include "tables.h"
#include "unicode/coll.h"
#include "unicode/normlzr.h"
/**
* Utility class for normalizing and merging patterns for collation.
* Patterns are strings of the form <entry>*, where <entry> has the
* form:
* <pre>
* <pattern> := <entry>*
* <entry> := <separator><chars>{"/"<extension>}
* <separator> := "=", ",", ";", "<", "&"
* <chars>, and <extension> are both arbitrary strings.
* </pre>
* <P>Unquoted whitespace is ignored.
* Single quotes, as in 'xxx', can be used to quote characters.
* <P>
* One difference from Collation is that & is used to reset to a current
* point. Or, in other words, it introduces a new sequence which is to
* be added to the old.
* <P>
* That is: "a < b < c < d" is the same as "a < b & b < c & c < d" OR
* "a < b < d & b < c"
* XXX: make '' be a single quote.
* @see PatternEntry
* @version 1.4 1/7/97
* @author Mark Davis, Helena Shih
*/
class MergeCollation
{
public:
/**
* Creates a merged collation table from a pattern string.
* @param pattern the pattern string.
* @param decompMode the Normalizer::EMode decomposition mode.
* NOTE(review): how the mode is applied is decided by the
* implementation file, not by this header -- confirm there.
* @param success the error code status. If the input pattern is incorrect,
* this will be set to U_INVALID_FORMAT_ERROR.
*/
MergeCollation( const UnicodeString& pattern,
Normalizer::EMode decompMode,
UErrorCode& success);
/**
* Copy constructor.
* @param other the merge collation object to copy from.
*/
MergeCollation( const MergeCollation& other);
/**
* Destructor.
*/
~MergeCollation();
/**
* Assignment operator.
* @param other the merge collation object to assign from.
* @return a const reference to a MergeCollation. Because the return type
* is const-qualified, the result cannot be used as a modifiable
* lvalue. NOTE(review): presumably refers to *this -- confirm
* in the implementation.
*/
const MergeCollation& operator=(const MergeCollation& other);
/**
* Recovers current pattern from this merged collation object.
* This overload is defined inline in this header and forwards to
* getPattern(pattern, TRUE).
* @param pattern the result buffer.
* @return the recovered result.
*/
UnicodeString& getPattern(UnicodeString& pattern) const;
/**
* Recovers current pattern with white spaces.
* @param pattern the result buffer.
* @param withWhiteSpace puts spacing around the entries, and \n
* before & and <
* @return the recovered result.
*/
UnicodeString& getPattern(UnicodeString& pattern, UBool withWhiteSpace) const;
/**
* Emits the pattern for collation builder.
* This overload is defined inline in this header and forwards to
* emitPattern(pattern, TRUE).
* @param pattern the result buffer.
* @return Emits the string in the format understandable to the collation
* builder.
*/
UnicodeString& emitPattern(UnicodeString& pattern) const;
/**
* Emits the pattern for collation builder.
* @param pattern the result buffer.
* @param withWhiteSpace puts spacing around the entries, and \n
* before & and <
* @return Emits the string in the format understandable to the collation
* builder.
*/
UnicodeString& emitPattern(UnicodeString& pattern, UBool withWhiteSpace) const;
/**
* Sets the pattern.
* @param pattern string.
* @param decompMode the Normalizer::EMode decomposition mode.
* NOTE(review): how the mode is applied is decided by the
* implementation file, not by this header -- confirm there.
* @param status the error code status, it will be set to U_INVALID_FORMAT_ERROR
* if the pattern is incorrect.
*/
void setPattern(const UnicodeString& pattern,
Normalizer::EMode decompMode,
UErrorCode& status);
/**
* Adds a pattern to the current merge collation object.
* @param pattern the new pattern to be added.
* @param decompMode the Normalizer::EMode decomposition mode.
* NOTE(review): how the mode is applied is decided by the
* implementation file, not by this header -- confirm there.
* @param status the error code status, it will be set to U_INVALID_FORMAT_ERROR
* if the pattern is incorrect.
*/
void addPattern(const UnicodeString& pattern,
Normalizer::EMode decompMode,
UErrorCode& status);
/**
* Gets count of separate entries in the merge collation object.
* @return the number of pattern entries
*/
int32_t getCount(void) const;
/**
* Gets the specified pattern entry out of the merge collation object.
* @param index the offset of the desired pattern entry
* @return the requested pattern entry
*/
const PatternEntry* getItemAt(UTextOffset index) const;
private:
//============================================================
// privates
//============================================================
VectorOfPointersToPatternEntry* patterns; // a vector of PatternEntries
// Class-wide constants; their values are defined outside this header.
// Per the modification history at the top of this file, BYTEPOWER and
// BYTEMASK were added to replace an earlier BYTESIZE constant.
// NOTE(review): presumably used for bit addressing into statusArray --
// confirm in the implementation.
static const int32_t BITARRAYSIZE;
static const uint8_t BITARRAYMASK;
static const int32_t BYTEPOWER;
static const int32_t BYTEMASK;
// The entry that fixEntry() resets to, or inserts after (see fixEntry()).
PatternEntry* lastEntry;
// NOTE(review): purpose is not visible from this header -- confirm in
// the implementation.
PatternEntry* saveEntry;
// Raw byte buffer. NOTE(review): presumably a bit array sized and indexed
// with the constants above; ownership is not visible here -- confirm.
uint8_t* statusArray;
/**
* Finds the last pattern entry before the specified offset that does not have
* extension chars.
* @param i the offset.
* @return the pattern entry.
*/
const PatternEntry* findLastWithNoExtension(int32_t i) const;
/**
* Fixes the new pattern entry in the merge collation table.
* If the strength is RESET, then just change the lastEntry to
* be the current. (If the current is not in patterns, signal an error).
* If not, then remove the current entry, and add it after lastEntry
* (which is usually at the end). Strength indicates the text order
* weight for an entry.
* @param newEntry the new pattern entry
* @param status the error code status, it will be set to U_INVALID_FORMAT_ERROR
* if the strength is RESET and a previous entry can't be found.
*/
void fixEntry( PatternEntry* newEntry,
UErrorCode& status);
/**
* Finds the offset of the specified entry that was previously installed in the
* merge collation object.
* @param lastPatEntry the entry that was previously installed.
* @param excess the extra characters
* @param success the error code status, it will be set to U_INVALID_FORMAT_ERROR
* if the strength is RESET and a previous entry can't be found.
* @return the offset of the found entry
*/
int32_t findLastEntry( const PatternEntry* lastPatEntry,
UnicodeString& excess,
UErrorCode& success) const;
};
/**
* Convenience overload: recovers the current pattern into the caller's
* buffer by forwarding to the two-argument getPattern() with TRUE passed
* as the white-space flag.
* @param result the result buffer.
* @return the reference returned by the two-argument overload.
*/
inline UnicodeString&
MergeCollation::getPattern(UnicodeString& result) const
{
    // Delegate to the full overload, always requesting white space.
    UnicodeString& recovered = this->getPattern(result, TRUE);
    return recovered;
}
/**
* Convenience overload: emits the pattern into the caller's buffer by
* forwarding to the two-argument emitPattern() with TRUE passed as the
* white-space flag.
* @param result the result buffer.
* @return the reference returned by the two-argument overload.
*/
inline UnicodeString&
MergeCollation::emitPattern(UnicodeString& result) const
{
    // Delegate to the full overload, always requesting white space.
    UnicodeString& emitted = this->emitPattern(result, TRUE);
    return emitted;
}
#endif // MERGECOL_H