// icu4c/source/common/dictionarydata.cpp - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 *******************************************************************************
 * Copyright (C) 2014-2016, International Business Machines
 * Corporation and others.  All Rights Reserved.
 *******************************************************************************
 * dictionarydata.cpp
 *
 * created on: 2012may31
 * created by: Markus W. Scherer & Maxime Serrano
 */

 #include "dictionarydata.h"
 #include "unicode/ucharstrie.h"
 #include "unicode/bytestrie.h"
 #include "unicode/udata.h"
 #include "cmemory.h"

 #if !UCONFIG_NO_BREAK_ITERATION

 U_NAMESPACE_BEGIN

 // Trie type codes. udict_swap() masks indexes[IX_TRIE_TYPE] with
 // TRIE_TYPE_MASK and compares the result against these values.
 const int32_t DictionaryData::TRIE_TYPE_BYTES = 0x0;
 const int32_t DictionaryData::TRIE_TYPE_UCHARS = 0x1;
 const int32_t DictionaryData::TRIE_TYPE_MASK = 0x7;
 // Bit just above TRIE_TYPE_MASK; not used by the code visible in this file.
 const int32_t DictionaryData::TRIE_HAS_VALUES = 0x8;

 // Transform constants, consumed by BytesDictionaryMatcher::transform():
 // the type lives in the bits selected by TRANSFORM_TYPE_MASK and the
 // offset in the bits selected by TRANSFORM_OFFSET_MASK.
 const int32_t DictionaryData::TRANSFORM_NONE = 0x0;
 const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x01000000;
 const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7F000000;
 const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x001FFFFF;

 /** Out-of-line destructor; the body is empty, nothing is released here. */
 DictionaryMatcher::~DictionaryMatcher() {}

 /** Hands the data handle stored in `file` to udata_close(). */
 UCharsDictionaryMatcher::~UCharsDictionaryMatcher()
 {
     udata_close(file);
 }

 /** @return The trie type of this matcher, always DictionaryData::TRIE_TYPE_UCHARS. */
 int32_t UCharsDictionaryMatcher::getType() const
 {
     const int32_t trieKind = DictionaryData::TRIE_TYPE_UCHARS;
     return trieKind;
 }

 /**
  * Walks a UCharsTrie over the code points read from `text`, starting at the
  * text's current position, and records each prefix for which the trie
  * reports a value.
  *
  * @param text       Input text; advanced with utext_next32() as code points are consumed.
  * @param maxLength  The scan stops once the consumed native-index span reaches this value.
  * @param limit      Matches are only stored while fewer than `limit` have been recorded.
  * @param lengths    If non-NULL, receives the native-index length of each match.
  * @param cpLengths  If non-NULL, receives the code point count of each match.
  * @param values     If non-NULL, receives the trie value of each match.
  * @param prefix     If non-NULL, receives the number of code points fed to the trie,
  *                   including the one that ended the scan.
  * @return The number of matches recorded.
  */
 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
                             int32_t *prefix) const {
     UCharsTrie trie(characters);
     const int32_t origin = static_cast<int32_t>(utext_getNativeIndex(text));
     int32_t found = 0;
     int32_t consumed = 0;

     UChar32 cp;
     while ((cp = utext_next32(text)) >= 0) {
         // The very first code point restarts the trie; later ones continue from its state.
         const UStringTrieResult step = (consumed == 0) ? trie.first(cp) : trie.next(cp);
         const int32_t span = static_cast<int32_t>(utext_getNativeIndex(text)) - origin;
         ++consumed;

         if (USTRINGTRIE_HAS_VALUE(step)) {
             if (found < limit) {
                 if (values != NULL) {
                     values[found] = trie.getValue();
                 }
                 if (lengths != NULL) {
                     lengths[found] = span;
                 }
                 if (cpLengths != NULL) {
                     cpLengths[found] = consumed;
                 }
                 ++found;
             }
             // A final value means no longer entry can follow this prefix.
             if (step == USTRINGTRIE_FINAL_VALUE) {
                 break;
             }
         } else if (step == USTRINGTRIE_NO_MATCH) {
             break;
         }

         if (span >= maxLength) {
             break;
         }
     }

     if (prefix != NULL) {
         *prefix = consumed;
     }
     return found;
 }

 /** Hands the data handle stored in `file` to udata_close(). */
 BytesDictionaryMatcher::~BytesDictionaryMatcher()
 {
     udata_close(file);
 }

 /**
  * Maps a code point to the value that is fed to the byte trie.
  *
  * When the type bits of transformConstant are not TRANSFORM_TYPE_OFFSET the
  * code point is returned unchanged. Otherwise U+200D maps to 0xFF, U+200C
  * maps to 0xFE, and any other code point maps to its distance from the
  * offset stored in transformConstant, provided that distance is in 0..0xFD.
  *
  * @param c  Code point to map.
  * @return The mapped value, or U_SENTINEL if `c` is outside the offset window.
  */
 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
     const int32_t transformType = transformConstant & DictionaryData::TRANSFORM_TYPE_MASK;
     if (transformType != DictionaryData::TRANSFORM_TYPE_OFFSET) {
         return c;
     }

     switch (c) {
     case 0x200D:
         return 0xFF;
     case 0x200C:
         return 0xFE;
     default:
         break;
     }

     const int32_t base = transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK;
     const int32_t delta = c - base;
     // 0xFE and 0xFF are reserved for the two special cases above.
     if (0 <= delta && delta <= 0xFD) {
         return (UChar32)delta;
     }
     return U_SENTINEL;
 }

 /** @return The trie type of this matcher, always DictionaryData::TRIE_TYPE_BYTES. */
 int32_t BytesDictionaryMatcher::getType() const
 {
     const int32_t trieKind = DictionaryData::TRIE_TYPE_BYTES;
     return trieKind;
 }

 /**
  * Walks a BytesTrie over the code points read from `text`, starting at the
  * text's current position, and records each prefix for which the trie
  * reports a value. Every code point is mapped through transform() first.
  *
  * A code point that transform() rejects (negative result, U_SENTINEL) ends
  * the scan as a non-match and is never handed to the trie.
  * BytesTrie::first()/next() treat inputs -0x100..-1 like 0..0xff, so passing
  * U_SENTINEL (-1) through would look it up as byte 0xFF, which is the byte
  * transform() assigns to U+200D.
  *
  * @param text       Input text; advanced with utext_next32() as code points are consumed.
  * @param maxLength  The scan stops once the consumed native-index span reaches this value.
  * @param limit      Matches are only stored while fewer than `limit` have been recorded.
  * @param lengths    If non-NULL, receives the native-index length of each match.
  * @param cpLengths  If non-NULL, receives the code point count of each match.
  * @param values     If non-NULL, receives the trie value of each match.
  * @param prefix     If non-NULL, receives the number of code points consumed,
  *                   including the one that ended the scan.
  * @return The number of matches recorded.
  */
 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
                             int32_t *prefix) const {
     BytesTrie bt(characters);
     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
     int32_t wordCount = 0;
     int32_t codePointsMatched = 0;

     for (UChar32 c = utext_next32(text); c >= 0; c = utext_next32(text)) {
         const UChar32 trieByte = transform(c);
         UStringTrieResult result;
         if (trieByte < 0) {
             // Unmappable code point: must not alias a real byte inside the trie.
             result = USTRINGTRIE_NO_MATCH;
         } else {
             result = (codePointsMatched == 0) ? bt.first(trieByte) : bt.next(trieByte);
         }
         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
         codePointsMatched += 1;
         if (USTRINGTRIE_HAS_VALUE(result)) {
             if (wordCount < limit) {
                 if (values != NULL) {
                     values[wordCount] = bt.getValue();
                 }
                 if (lengths != NULL) {
                     lengths[wordCount] = lengthMatched;
                 }
                 if (cpLengths != NULL) {
                     cpLengths[wordCount] = codePointsMatched;
                 }
                 ++wordCount;
             }
             // A final value means no longer entry can follow this prefix.
             if (result == USTRINGTRIE_FINAL_VALUE) {
                 break;
             }
         }
         else if (result == USTRINGTRIE_NO_MATCH) {
             break;
         }
         if (lengthMatched >= maxLength) {
             break;
         }
     }

     if (prefix != NULL) {
         *prefix = codePointsMatched;
     }
     return wordCount;
 }


 U_NAMESPACE_END

 U_NAMESPACE_USE

 /**
  * Byte-swaps (or preflights) a dictionary data item: data format "Dict",
  * format version 1.
  *
  * The indexes array is swapped as 32-bit units. For a UChars trie the trie
  * section is swapped as 16-bit units; a bytes trie needs no swapping. The two
  * reserved sections after the trie are empty in the current format and are
  * only copied.
  *
  * @param ds          Swapper that supplies the read/swap callbacks.
  * @param inData      Start of the input item, including its data header.
  * @param length      Number of input bytes, or a negative value to compute
  *                    the size only (nothing is written to outData).
  * @param outData     Destination buffer; may be the same memory as inData.
  * @param pErrorCode  In/out ICU error code; set on any failure.
  * @return Header size plus the total dictionary size, or 0 on error.
  */
 U_CAPI int32_t U_EXPORT2
 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
            void *outData, UErrorCode *pErrorCode) {
     int32_t indexes[DictionaryData::IX_COUNT];
     const int32_t indexesSize = (int32_t)sizeof(indexes);

     const int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;

     // Verify the data format bytes 'D' 'i' 'c' 't' and format version 1.
     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
     if (!(pInfo->dataFormat[0] == 0x44 &&
           pInfo->dataFormat[1] == 0x69 &&
           pInfo->dataFormat[2] == 0x63 &&
           pInfo->dataFormat[3] == 0x74 &&
           pInfo->formatVersion[0] == 1)) {
         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
         *pErrorCode = U_UNSUPPORTED_ERROR;
         return 0;
     }

     const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
     uint8_t *outBytes = (uint8_t *)outData + headerSize;
     const int32_t *inIndexes = (const int32_t *)inBytes;

     if (length >= 0) {
         length -= headerSize;
         if (length < indexesSize) {
             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
             return 0;
         }
     }

     for (int32_t i = 0; i < DictionaryData::IX_COUNT; i++) {
         indexes[i] = udata_readInt32(ds, inIndexes[i]);
     }

     const int32_t size = indexes[DictionaryData::IX_TOTAL_SIZE];

     // The total size comes from the data itself. It must at least cover the
     // indexes; a negative value would otherwise slip past the length check
     // below and be used as a copy length.
     if (size < indexesSize) {
         udata_printError(ds, "udict_swap(): invalid total size %d in dictionary data indexes\n", size);
         *pErrorCode = U_INVALID_FORMAT_ERROR;
         return 0;
     }

     if (length >= 0) {
         if (length < size) {
             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
             return 0;
         }

         // Copy everything first so that sections which need no swapping
         // (bytes trie, reserved sections) are already in place.
         if (inBytes != outBytes) {
             uprv_memcpy(outBytes, inBytes, size);
         }

         ds->swapArray32(ds, inBytes, indexesSize, outBytes, pErrorCode);

         const int32_t trieStart = indexesSize;
         const int32_t trieLimit = indexes[DictionaryData::IX_RESERVED1_OFFSET];
         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
             // Keep the 16-bit swap inside the bytes that were validated above.
             if (trieLimit < trieStart || size < trieLimit) {
                 udata_printError(ds, "udict_swap(): invalid trie limit offset %d in dictionary data indexes\n", trieLimit);
                 *pErrorCode = U_INVALID_FORMAT_ERROR;
                 return 0;
             }
             ds->swapArray16(ds, inBytes + trieStart, trieLimit - trieStart, outBytes + trieStart, pErrorCode);
         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
             // nothing to do
         } else {
             udata_printError(ds, "udict_swap(): unknown trie type!\n");
             *pErrorCode = U_UNSUPPORTED_ERROR;
             return 0;
         }

         // The two sections delimited by IX_RESERVED1_OFFSET,
         // IX_RESERVED2_OFFSET and IX_TOTAL_SIZE are empty in the current
         // format, but may be used later.
     }
     return headerSize + size;
 }
 #endif
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	/*
	*******************************************************************************
	* Copyright (C) 2014-2016, International Business Machines
	* Corporation and others. All Rights Reserved.
	*******************************************************************************
	* dictionarydata.cpp
	*
	* created on: 2012may31
	* created by: Markus W. Scherer & Maxime Serrano
	*/

	#include "dictionarydata.h"
	#include "unicode/ucharstrie.h"
	#include "unicode/bytestrie.h"
	#include "unicode/udata.h"
	#include "cmemory.h"

	#if !UCONFIG_NO_BREAK_ITERATION

	U_NAMESPACE_BEGIN

	// Trie type codes. udict_swap() masks indexes[IX_TRIE_TYPE] with
	// TRIE_TYPE_MASK and compares the result against these values.
	const int32_t DictionaryData::TRIE_TYPE_BYTES = 0x0;
	const int32_t DictionaryData::TRIE_TYPE_UCHARS = 0x1;
	const int32_t DictionaryData::TRIE_TYPE_MASK = 0x7;
	// Bit just above TRIE_TYPE_MASK; not used by the code visible in this file.
	const int32_t DictionaryData::TRIE_HAS_VALUES = 0x8;

	// Transform constants, consumed by BytesDictionaryMatcher::transform():
	// the type lives in the bits selected by TRANSFORM_TYPE_MASK and the
	// offset in the bits selected by TRANSFORM_OFFSET_MASK.
	const int32_t DictionaryData::TRANSFORM_NONE = 0x0;
	const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x01000000;
	const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7F000000;
	const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x001FFFFF;

	/** Out-of-line destructor; the body is empty, nothing is released here. */
	DictionaryMatcher::~DictionaryMatcher() {}

	/** Hands the data handle stored in `file` to udata_close(). */
	UCharsDictionaryMatcher::~UCharsDictionaryMatcher()
	{
	    udata_close(file);
	}

	/** @return The trie type of this matcher, always DictionaryData::TRIE_TYPE_UCHARS. */
	int32_t UCharsDictionaryMatcher::getType() const
	{
	    const int32_t trieKind = DictionaryData::TRIE_TYPE_UCHARS;
	    return trieKind;
	}

	int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
	int32_t lengths, int32_t cpLengths, int32_t *values,
	int32_t *prefix) const {

	UCharsTrie uct(characters);
	int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
	int32_t wordCount = 0;
	int32_t codePointsMatched = 0;

	for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
	UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
	int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
	codePointsMatched += 1;
	if (USTRINGTRIE_HAS_VALUE(result)) {
	if (wordCount < limit) {
	if (values != NULL) {
	values[wordCount] = uct.getValue();
	}
	if (lengths != NULL) {
	lengths[wordCount] = lengthMatched;
	}
	if (cpLengths != NULL) {
	cpLengths[wordCount] = codePointsMatched;
	}
	++wordCount;
	}
	if (result == USTRINGTRIE_FINAL_VALUE) {
	break;
	}
	}
	else if (result == USTRINGTRIE_NO_MATCH) {
	break;
	}
	if (lengthMatched >= maxLength) {
	break;
	}
	}

	if (prefix != NULL) {
	*prefix = codePointsMatched;
	}
	return wordCount;
	}

	/** Hands the data handle stored in `file` to udata_close(). */
	BytesDictionaryMatcher::~BytesDictionaryMatcher()
	{
	    udata_close(file);
	}

	/**
	 * Maps a code point to the value that is fed to the byte trie.
	 *
	 * When the type bits of transformConstant are not TRANSFORM_TYPE_OFFSET the
	 * code point is returned unchanged. Otherwise U+200D maps to 0xFF, U+200C
	 * maps to 0xFE, and any other code point maps to its distance from the
	 * offset stored in transformConstant, provided that distance is in 0..0xFD.
	 *
	 * @param c  Code point to map.
	 * @return The mapped value, or U_SENTINEL if `c` is outside the offset window.
	 */
	UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
	    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
	        if (c == 0x200D) {
	            return 0xFF;
	        } else if (c == 0x200C) {
	            return 0xFE;
	        }
	        int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
	        // 0xFE and 0xFF are reserved for the two special cases above.
	        if (delta < 0 || 0xFD < delta) {
	            return U_SENTINEL;
	        }
	        return (UChar32)delta;
	    }
	    return c;
	}

	/** @return The trie type of this matcher, always DictionaryData::TRIE_TYPE_BYTES. */
	int32_t BytesDictionaryMatcher::getType() const
	{
	    const int32_t trieKind = DictionaryData::TRIE_TYPE_BYTES;
	    return trieKind;
	}

	int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
	int32_t lengths, int32_t cpLengths, int32_t *values,
	int32_t *prefix) const {
	BytesTrie bt(characters);
	int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
	int32_t wordCount = 0;
	int32_t codePointsMatched = 0;

	for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
	UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
	int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
	codePointsMatched += 1;
	if (USTRINGTRIE_HAS_VALUE(result)) {
	if (wordCount < limit) {
	if (values != NULL) {
	values[wordCount] = bt.getValue();
	}
	if (lengths != NULL) {
	lengths[wordCount] = lengthMatched;
	}
	if (cpLengths != NULL) {
	cpLengths[wordCount] = codePointsMatched;
	}
	++wordCount;
	}
	if (result == USTRINGTRIE_FINAL_VALUE) {
	break;
	}
	}
	else if (result == USTRINGTRIE_NO_MATCH) {
	break;
	}
	if (lengthMatched >= maxLength) {
	break;
	}
	}

	if (prefix != NULL) {
	*prefix = codePointsMatched;
	}
	return wordCount;
	}


	U_NAMESPACE_END

	U_NAMESPACE_USE

	/**
	 * Byte-swaps (or preflights) a dictionary data item: data format "Dict",
	 * format version 1.
	 *
	 * The indexes array is swapped as 32-bit units. For a UChars trie the trie
	 * section is swapped as 16-bit units; a bytes trie needs no swapping. The two
	 * reserved sections after the trie are empty in the current format and are
	 * only copied.
	 *
	 * @param ds          Swapper that supplies the read/swap callbacks.
	 * @param inData      Start of the input item, including its data header.
	 * @param length      Number of input bytes, or a negative value to compute
	 *                    the size only (nothing is written to outData).
	 * @param outData     Destination buffer; may be the same memory as inData.
	 * @param pErrorCode  In/out ICU error code; set on any failure.
	 * @return Header size plus the total dictionary size, or 0 on error.
	 */
	U_CAPI int32_t U_EXPORT2
	udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
	           void *outData, UErrorCode *pErrorCode) {
	    int32_t indexes[DictionaryData::IX_COUNT];
	    const int32_t indexesSize = (int32_t)sizeof(indexes);

	    const int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
	    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;

	    // Verify the data format bytes 'D' 'i' 'c' 't' and format version 1.
	    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
	    if (!(pInfo->dataFormat[0] == 0x44 &&
	          pInfo->dataFormat[1] == 0x69 &&
	          pInfo->dataFormat[2] == 0x63 &&
	          pInfo->dataFormat[3] == 0x74 &&
	          pInfo->formatVersion[0] == 1)) {
	        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
	                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
	        *pErrorCode = U_UNSUPPORTED_ERROR;
	        return 0;
	    }

	    const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
	    uint8_t *outBytes = (uint8_t *)outData + headerSize;
	    const int32_t *inIndexes = (const int32_t *)inBytes;

	    if (length >= 0) {
	        length -= headerSize;
	        if (length < indexesSize) {
	            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
	            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
	            return 0;
	        }
	    }

	    for (int32_t i = 0; i < DictionaryData::IX_COUNT; i++) {
	        indexes[i] = udata_readInt32(ds, inIndexes[i]);
	    }

	    const int32_t size = indexes[DictionaryData::IX_TOTAL_SIZE];

	    // The total size comes from the data itself. It must at least cover the
	    // indexes; a negative value would otherwise slip past the length check
	    // below and be used as a copy length.
	    if (size < indexesSize) {
	        udata_printError(ds, "udict_swap(): invalid total size %d in dictionary data indexes\n", size);
	        *pErrorCode = U_INVALID_FORMAT_ERROR;
	        return 0;
	    }

	    if (length >= 0) {
	        if (length < size) {
	            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
	            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
	            return 0;
	        }

	        // Copy everything first so that sections which need no swapping
	        // (bytes trie, reserved sections) are already in place.
	        if (inBytes != outBytes) {
	            uprv_memcpy(outBytes, inBytes, size);
	        }

	        ds->swapArray32(ds, inBytes, indexesSize, outBytes, pErrorCode);

	        const int32_t trieStart = indexesSize;
	        const int32_t trieLimit = indexes[DictionaryData::IX_RESERVED1_OFFSET];
	        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

	        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
	            // Keep the 16-bit swap inside the bytes that were validated above.
	            if (trieLimit < trieStart || size < trieLimit) {
	                udata_printError(ds, "udict_swap(): invalid trie limit offset %d in dictionary data indexes\n", trieLimit);
	                *pErrorCode = U_INVALID_FORMAT_ERROR;
	                return 0;
	            }
	            ds->swapArray16(ds, inBytes + trieStart, trieLimit - trieStart, outBytes + trieStart, pErrorCode);
	        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
	            // nothing to do
	        } else {
	            udata_printError(ds, "udict_swap(): unknown trie type!\n");
	            *pErrorCode = U_UNSUPPORTED_ERROR;
	            return 0;
	        }

	        // The two sections delimited by IX_RESERVED1_OFFSET,
	        // IX_RESERVED2_OFFSET and IX_TOTAL_SIZE are empty in the current
	        // format, but may be used later.
	    }
	    return headerSize + size;
	}
	#endif