icu4c/source/i18n/usrchimp.h - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 **********************************************************************
 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
 **********************************************************************
 *   Date        Name        Description
 *  08/13/2001   synwee      Creation.
 **********************************************************************
 */
 #ifndef USRCHIMP_H
 #define USRCHIMP_H

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_COLLATION

 #include "unicode/normalizer2.h"
 #include "unicode/ucol.h"
 #include "unicode/ucoleitr.h"
 #include "unicode/ubrk.h"

 /*
  * Field layout of a 32-bit collation element (CE) order value, as selected by
  * the masks below: bits 31..16 primary, bits 15..8 secondary, bits 7..0
  * tertiary.
  */
 /* mask off anything but primary order */
 #define UCOL_PRIMARYORDERMASK 0xffff0000
 /* mask off anything but secondary order */
 #define UCOL_SECONDARYORDERMASK 0x0000ff00
 /* mask off anything but tertiary order */
 #define UCOL_TERTIARYORDERMASK 0x000000ff
 /* primary order shift */
 #define UCOL_PRIMARYORDERSHIFT 16
 /* secondary order shift */
 #define UCOL_SECONDARYORDERSHIFT 8

 /* order value 0 (NOTE(review): by its name, the value of an ignorable CE -- confirm) */
 #define UCOL_IGNORABLE 0

 /* get weights from a CE; each macro yields the weight right-aligned at bit 0 */
 #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
 #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
 #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

 /* bit pattern tested by isContinuation() */
 #define UCOL_CONTINUATION_MARKER 0xC0

 /* true when every bit of UCOL_CONTINUATION_MARKER is set in CE */
 #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

 /**
  * This indicates an error has occurred during processing or there are no more CEs
  * to be returned.
  */
 #define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)

 U_NAMESPACE_BEGIN

 class CollationElementIterator;
 class Collator;

 /**
  * One entry of a PCEBuffer: a 64-bit element value together with two 32-bit
  * index fields.
  * NOTE(review): low/high presumably carry the ixLow/ixHigh arguments given to
  * PCEBuffer::put() -- confirm in the implementation.
  */
 struct PCEI {
     uint64_t ce;   /* 64-bit element value */
     int32_t low;   /* first index field */
     int32_t high;  /* second index field */
 };

 /**
  * Buffer of PCEI entries with 16 entries of inline storage.
  * NOTE(review): `buffer` presumably points at `defaultBuffer` until more room
  * is needed, with the destructor releasing any replacement -- the
  * implementation is not visible here; confirm.
  */
 struct PCEBuffer
 {
     PCEI    defaultBuffer[16];   // inline storage for 16 entries
     PCEI   *buffer;              // storage currently in use
     int32_t bufferIndex;         // NOTE(review): presumably the number of stored entries -- confirm
     int32_t bufferSize;          // NOTE(review): presumably the capacity of `buffer` -- confirm

     PCEBuffer();
     ~PCEBuffer();

     void  reset();
     UBool isEmpty() const;
     // Stores one (ce, ixLow, ixHigh) entry; failures are reported through errorCode.
     void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
     // Returns a pointer to a stored entry (NOTE(review): result when empty is not visible here).
     const PCEI *get();
 };

 /**
  * Produces "processed" 64-bit collation elements from a collation element
  * iterator, in either direction, via nextProcessed() / previousProcessed().
  * Can be constructed from, or re-initialized with, either the C handle
  * (UCollationElements) or the C++ CollationElementIterator.
  */
 class UCollationPCE : public UMemory {
 private:
     PCEBuffer          pceBuffer;
     CollationElementIterator *cei;
     // NOTE(review): the four fields below are presumably captured from the
     // collator by init(const Collator &) -- confirm in the implementation.
     UCollationStrength strength;
     UBool              toShift;
     UBool              isShifted;
     uint32_t           variableTop;

 public:
     UCollationPCE(UCollationElements *elems);
     UCollationPCE(CollationElementIterator *iter);
     ~UCollationPCE();

     void init(UCollationElements *elems);
     void init(CollationElementIterator *iter);

     /**
      * Get the processed ordering priority of the next collation element in the text.
      * A single character may contain more than one collation element.
      *
      * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
      * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
      * @param status A pointer to a UErrorCode to receive any errors.
      * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
      *         if an error has occurred or if the end of string has been reached.
      */
     int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
     /**
      * Get the processed ordering priority of the previous collation element in the text.
      * A single character may contain more than one collation element.
      *
      * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE.
      * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE.
      * @param status A pointer to a UErrorCode to receive any errors. Notably
      *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
      *               buffer has been exhausted.
      * @return The previous collation element's ordering, otherwise returns
      *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
      *         string has been reached.
      */
     int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);

 private:
     void init(const Collator &coll);
     // Maps a 32-bit CE to its 64-bit processed form.
     uint64_t processCE(uint32_t ce);
 };

 U_NAMESPACE_END

 /* Element count of the inline CE/PCE buffers in UPattern and of the accent buffers in UStringSearch. */
 #define INITIAL_ARRAY_SIZE_       256
 /* Element count of the shift[] and backShift[] tables in UPattern. */
 #define MAX_TABLE_SIZE_           257

 /**
  * Per-search state: the text being searched, matching options, break
  * iterators, and the position/length fields for a match.
  */
 struct USearch {
     // required since collation element iterator does not have a getText API
     const UChar              *text;
           int32_t             textLength; // exact length
           UBool               isOverlap;
           UBool               isCanonicalMatch;
           int16_t             elementComparisonType;
           UBreakIterator     *internalBreakIter;  //internal character breakiterator
           UBreakIterator     *breakIter;
     // value USEARCH_DONE is the default value
     // if we are not at the start of the text or the end of the text,
     // depending on the iteration direction and matchedIndex is USEARCH_DONE
     // it means that we can't find any more matches in that particular direction
           int32_t             matchedIndex;
           int32_t             matchedLength;
           UBool               isForwardSearching;
           UBool               reset;
 };

 /**
  * Pattern data: the pattern text, two element arrays (`ces`, 32-bit, and
  * `pces`, 64-bit) each paired with an inline buffer of INITIAL_ARRAY_SIZE_
  * entries, accent flags, and two shift tables of MAX_TABLE_SIZE_ entries.
  * NOTE(review): `ces`/`pces` presumably point at the inline buffers unless
  * the pattern needs more room -- confirm in the implementation.
  */
 struct UPattern {
     const UChar              *text;
           int32_t             textLength; // exact length
           // length required for backwards ce comparison
           int32_t             cesLength;
           int32_t            *ces;
           int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
           int32_t             pcesLength;
           int64_t            *pces;
           int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
           UBool               hasPrefixAccents;
           UBool               hasSuffixAccents;
           int16_t             defaultShiftSize;
           int16_t             shift[MAX_TABLE_SIZE_];
           int16_t             backShift[MAX_TABLE_SIZE_];
 };

 /**
  * Top-level string search object: bundles the search state, the pattern
  * (embedded by value), the collator and normalizer in use, and the collation
  * element iterators used while matching.
  */
 struct UStringSearch {
     struct USearch            *search;
     struct UPattern            pattern;
     const  UCollator          *collator;
     const  icu::Normalizer2   *nfd;
     // positions within the collation element iterator is used to determine
     // if we are at the start of the text.
            UCollationElements *textIter;
            icu::UCollationPCE *textProcessedIter;
     // utility collation element, used throughout program for temporary
     // iteration.
            UCollationElements *utilIter;
     // NOTE(review): presumably true when this object must close `collator` -- confirm
            UBool               ownCollator;
            UCollationStrength  strength;
            uint32_t            ceMask;
            uint32_t            variableTop;
            UBool               toShift;
            UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
            UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
 };

 /**
 * Exact matches without checking the ends for extra accents.
 * The match after the position within the collation element iterator is to be
 * found.
 * After a match is found the offset in the collation element iterator will be
 * shifted to the start of the match.
 * Implementation note:
 * For tertiary we can't use the collator->tertiaryMask, that is a
 * preprocessed mask that takes into account case options. Since we are only
 * concerned with exact matches, we don't need that.
 * Alternate handling - since only the 16 most significant digits are used,
 * we can safely do a compare without masking. If the ce is a variable, we mask
 * and get only the primary values; no shifting to quaternary is required since
 * all primary values less than variabletop will need to be masked off anyway.
 * If the end character is composite and the pattern ce does not match the text
 * ce, we skip it until we find a match in the end composite character or when
 * it has passed the character. This is so that we can match pattern "a" with
 * the text "\u00e6"
 * @param strsrch string search data
 * @param status error status if any
 * @return true if an exact match is found, false otherwise
 */
 U_CFUNC
 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);

 /**
 * Canonical matches.
 * According to the definition, matches found here will include the whole span
 * of beginning and ending accents if it overlaps that region.
 * @param strsrch string search data
 * @param status error status if any
 * @return true if a canonical match is found, false otherwise
 */
 U_CFUNC
 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);

 /**
 * Gets the previous match.
 * Comments follow from handleNextExact.
 * @param strsrch string search data
 * @param status error status if any
 * @return true if an exact match is found, false otherwise
 */
 U_CFUNC
 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);

 /**
 * Canonical matches, searching for the previous match (per the function name).
 * According to the definition, matches found here will include the whole span
 * of beginning and ending accents if it overlaps that region.
 * @param strsrch string search data
 * @param status error status if any
 * @return true if a canonical match is found, false otherwise
 */
 U_CFUNC
 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
                                       UErrorCode    *status);

 #endif /* #if !UCONFIG_NO_COLLATION */

 #endif
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	/*
	**********************************************************************
	* Copyright (C) 2001-2015 IBM and others. All rights reserved.
	**********************************************************************
	* Date Name Description
	* 08/13/2001 synwee Creation.
	**********************************************************************
	*/
	#ifndef USRCHIMP_H
	#define USRCHIMP_H

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_COLLATION

	#include "unicode/normalizer2.h"
	#include "unicode/ucol.h"
	#include "unicode/ucoleitr.h"
	#include "unicode/ubrk.h"

	/*
	* Field layout of a 32-bit collation element (CE) order value, as selected by
	* the masks below: bits 31..16 primary, bits 15..8 secondary, bits 7..0
	* tertiary.
	*/
	/* mask off anything but primary order */
	#define UCOL_PRIMARYORDERMASK 0xffff0000
	/* mask off anything but secondary order */
	#define UCOL_SECONDARYORDERMASK 0x0000ff00
	/* mask off anything but tertiary order */
	#define UCOL_TERTIARYORDERMASK 0x000000ff
	/* primary order shift */
	#define UCOL_PRIMARYORDERSHIFT 16
	/* secondary order shift */
	#define UCOL_SECONDARYORDERSHIFT 8

	/* order value 0 (NOTE(review): by its name, the value of an ignorable CE -- confirm) */
	#define UCOL_IGNORABLE 0

	/* get weights from a CE; each macro yields the weight right-aligned at bit 0 */
	#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
	#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
	#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

	/* bit pattern tested by isContinuation() */
	#define UCOL_CONTINUATION_MARKER 0xC0

	/* true when every bit of UCOL_CONTINUATION_MARKER is set in CE */
	#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

	/**
	* This indicates an error has occurred during processing or there are no more CEs
	* to be returned.
	*/
	#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)

	U_NAMESPACE_BEGIN

	class CollationElementIterator;
	class Collator;

	/**
	 * One entry of a PCEBuffer: a 64-bit element value together with two 32-bit
	 * index fields.
	 * NOTE(review): low/high presumably carry the ixLow/ixHigh arguments given to
	 * PCEBuffer::put() -- confirm in the implementation.
	 */
	struct PCEI {
	    uint64_t ce;   /* 64-bit element value */
	    int32_t low;   /* first index field */
	    int32_t high;  /* second index field */
	};

	/**
	* Buffer of PCEI entries with 16 entries of inline storage.
	* NOTE(review): `buffer` presumably points at `defaultBuffer` until more room
	* is needed, with the destructor releasing any replacement -- the
	* implementation is not visible here; confirm.
	*/
	struct PCEBuffer
	{
	// inline storage for 16 entries
	PCEI defaultBuffer[16];
	// storage currently in use
	PCEI *buffer;
	// NOTE(review): presumably the number of stored entries -- confirm
	int32_t bufferIndex;
	// NOTE(review): presumably the capacity of `buffer` -- confirm
	int32_t bufferSize;

	PCEBuffer();
	~PCEBuffer();

	void reset();
	UBool isEmpty() const;
	// Stores one (ce, ixLow, ixHigh) entry; failures are reported through errorCode.
	void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
	// Returns a pointer to a stored entry (NOTE(review): result when empty is not visible here).
	const PCEI *get();
	};

	/**
	 * Produces "processed" 64-bit collation elements from a collation element
	 * iterator, in either direction, via nextProcessed() / previousProcessed().
	 * Can be constructed from, or re-initialized with, either the C handle
	 * (UCollationElements) or the C++ CollationElementIterator.
	 */
	class UCollationPCE : public UMemory {
	private:
	    PCEBuffer          pceBuffer;
	    CollationElementIterator *cei;
	    // NOTE(review): the four fields below are presumably captured from the
	    // collator by init(const Collator &) -- confirm in the implementation.
	    UCollationStrength strength;
	    UBool              toShift;
	    UBool              isShifted;
	    uint32_t           variableTop;

	public:
	    UCollationPCE(UCollationElements *elems);
	    UCollationPCE(CollationElementIterator *iter);
	    ~UCollationPCE();

	    void init(UCollationElements *elems);
	    void init(CollationElementIterator *iter);

	    /**
	     * Get the processed ordering priority of the next collation element in the text.
	     * A single character may contain more than one collation element.
	     *
	     * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
	     * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
	     * @param status A pointer to a UErrorCode to receive any errors.
	     * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
	     *         if an error has occurred or if the end of string has been reached.
	     */
	    // ixLow/ixHigh are out-parameters, so they must be pointers (not by-value ints).
	    int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
	    /**
	     * Get the processed ordering priority of the previous collation element in the text.
	     * A single character may contain more than one collation element.
	     *
	     * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE.
	     * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE.
	     * @param status A pointer to a UErrorCode to receive any errors. Notably
	     *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
	     *               buffer has been exhausted.
	     * @return The previous collation element's ordering, otherwise returns
	     *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
	     *         string has been reached.
	     */
	    // ixLow/ixHigh are out-parameters, so they must be pointers (not by-value ints).
	    int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);

	private:
	    void init(const Collator &coll);
	    // Maps a 32-bit CE to its 64-bit processed form.
	    uint64_t processCE(uint32_t ce);
	};

	U_NAMESPACE_END

	/* Element count of the inline CE/PCE buffers in UPattern and of the accent buffers in UStringSearch. */
	#define INITIAL_ARRAY_SIZE_ 256
	/* Element count of the shift[] and backShift[] tables in UPattern. */
	#define MAX_TABLE_SIZE_ 257

	/**
	* Per-search state: the text being searched, matching options, break
	* iterators, and the position/length fields for a match.
	*/
	struct USearch {
	// required since collation element iterator does not have a getText API
	const UChar *text;
	int32_t textLength; // exact length
	UBool isOverlap;
	UBool isCanonicalMatch;
	int16_t elementComparisonType;
	UBreakIterator *internalBreakIter; //internal character breakiterator
	UBreakIterator *breakIter;
	// value USEARCH_DONE is the default value
	// if we are not at the start of the text or the end of the text,
	// depending on the iteration direction and matchedIndex is USEARCH_DONE
	// it means that we can't find any more matches in that particular direction
	int32_t matchedIndex;
	int32_t matchedLength;
	UBool isForwardSearching;
	UBool reset;
	};

	/**
	* Pattern data: the pattern text, two element arrays (`ces`, 32-bit, and
	* `pces`, 64-bit) each paired with an inline buffer of INITIAL_ARRAY_SIZE_
	* entries, accent flags, and two shift tables of MAX_TABLE_SIZE_ entries.
	* NOTE(review): `ces`/`pces` presumably point at the inline buffers unless
	* the pattern needs more room -- confirm in the implementation.
	*/
	struct UPattern {
	const UChar *text;
	int32_t textLength; // exact length
	// length required for backwards ce comparison
	int32_t cesLength;
	int32_t *ces;
	int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
	int32_t pcesLength;
	int64_t *pces;
	int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
	UBool hasPrefixAccents;
	UBool hasSuffixAccents;
	int16_t defaultShiftSize;
	int16_t shift[MAX_TABLE_SIZE_];
	int16_t backShift[MAX_TABLE_SIZE_];
	};

	/**
	* Top-level string search object: bundles the search state, the pattern
	* (embedded by value), the collator and normalizer in use, and the collation
	* element iterators used while matching.
	*/
	struct UStringSearch {
	struct USearch *search;
	struct UPattern pattern;
	const UCollator *collator;
	const icu::Normalizer2 *nfd;
	// positions within the collation element iterator is used to determine
	// if we are at the start of the text.
	UCollationElements *textIter;
	icu::UCollationPCE *textProcessedIter;
	// utility collation element, used throughout program for temporary
	// iteration.
	UCollationElements *utilIter;
	// NOTE(review): presumably true when this object must close `collator` -- confirm
	UBool ownCollator;
	UCollationStrength strength;
	uint32_t ceMask;
	uint32_t variableTop;
	UBool toShift;
	UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
	UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
	};

	/**
	* Exact matches without checking the ends for extra accents.
	* The match after the position within the collation element iterator is to be
	* found.
	* After a match is found the offset in the collation element iterator will be
	* shifted to the start of the match.
	* Implementation note:
	* For tertiary we can't use the collator->tertiaryMask, that is a
	* preprocessed mask that takes into account case options. Since we are only
	* concerned with exact matches, we don't need that.
	* Alternate handling - since only the 16 most significant digits are used,
	* we can safely do a compare without masking. If the ce is a variable, we mask
	* and get only the primary values; no shifting to quaternary is required since
	* all primary values less than variabletop will need to be masked off anyway.
	* If the end character is composite and the pattern ce does not match the text
	* ce, we skip it until we find a match in the end composite character or when
	* it has passed the character. This is so that we can match pattern "a" with
	* the text "\u00e6"
	* @param strsrch string search data (passed by pointer; the match state is
	*        updated in place)
	* @param status error status if any (passed by pointer so errors reach the caller)
	* @return true if an exact match is found, false otherwise
	*/
	U_CFUNC
	UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);

	/**
	* Canonical matches.
	* According to the definition, matches found here will include the whole span
	* of beginning and ending accents if it overlaps that region.
	* @param strsrch string search data (passed by pointer; the match state is
	*        updated in place)
	* @param status error status if any (passed by pointer so errors reach the caller)
	* @return true if a canonical match is found, false otherwise
	*/
	U_CFUNC
	UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);

	/**
	* Gets the previous match.
	* Comments follow from handleNextExact.
	* @param strsrch string search data (passed by pointer; the match state is
	*        updated in place)
	* @param status error status if any (passed by pointer so errors reach the caller)
	* @return true if an exact match is found, false otherwise
	*/
	U_CFUNC
	UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);

	/**
	* Canonical matches, searching for the previous match (per the function name).
	* According to the definition, matches found here will include the whole span
	* of beginning and ending accents if it overlaps that region.
	* @param strsrch string search data
	* @param status error status if any
	* @return true if a canonical match is found, false otherwise
	*/
	U_CFUNC
	UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
	UErrorCode *status);

	#endif /* #if !UCONFIG_NO_COLLATION */

	#endif