source/i18n/brkdict.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 1999-2000 IBM and others. All rights reserved.
 **********************************************************************
 *   Date        Name        Description
 *   12/1/99     rtg         Ported from Java
 *   01/13/2000  helena      Added UErrorCode to ctors.
 **********************************************************************
 */

 #include "brkdict.h"
 #include "cmemory.h"
 #include "unicode/resbund.h"

 //=================================================================================
 // deserialization
 //=================================================================================

 /**
  * Loads the break dictionary from the binary "BreakDictionaryData" item of
  * the "th" locale resource bundle.
  *
  * Every member that the destructor releases is cleared before the first
  * early return, so destroying an object whose construction stopped on a
  * failing status never frees an uninitialized pointer.
  *
  * @param dictionaryFilename Not referenced by this implementation; the data
  *        always comes from the resource bundle.
  * @param status In/out error code.  Nothing is loaded if it already
  *        indicates failure on entry; it is set to U_FILE_ACCESS_ERROR when
  *        the memory stream over the resource data cannot be opened.
  */
 BreakDictionary::BreakDictionary(char* dictionaryFilename, UErrorCode& status)
 {
     // Put the object into a destructible state before anything can fail.
     columnMap = 0;
     table = 0;
     rowIndex = 0;
     rowIndexFlags = 0;
     rowIndexFlagsIndex = 0;
     rowIndexShifts = 0;
     numCols = 0;
     numColGroups = 0;

     if (U_FAILURE(status)) return;

     ResourceBundle th((char *)0, Locale("th"), status);

     if (U_FAILURE(status)) return;

     ResourceBundle th_dict = th.get("BreakDictionaryData", status);
     if (U_FAILURE(status)) return;

     int32_t len;
     const uint8_t * data = th_dict.getBinary(len, status);
     if (U_FAILURE(status)) return;

     // wrap the resource bytes in a stream so they can be read sequentially
     UMemoryStream* dictionaryStream = uprv_mstrm_openBuffer(data, len);

     if (dictionaryStream == 0) {
         status = U_FILE_ACCESS_ERROR;
         return;
     }
     readDictionaryFile(dictionaryStream);
     uprv_mstrm_close(dictionaryStream);
 }

 /**
  * Releases the column map and every array allocated by readDictionaryFile().
  */
 BreakDictionary::~BreakDictionary()
 {
     // the column map is absent when construction stopped before loading
     if (columnMap != 0) {
         ucmp8_close(columnMap);
     }

     // delete [] on a null pointer is a no-op, so these need no guard
     delete [] table;
     delete [] rowIndex;
     delete [] rowIndexFlags;
     delete [] rowIndexFlagsIndex;
     delete [] rowIndexShifts;
 }

 // Byte-order helpers for readDictionaryFile.  The data files originated from a
 // Java program, and Java always writes data out in big-endian format.  On
 // little-endian hosts SWAP32/SWAP16 reverse the bytes of their argument in place;
 // on big-endian hosts they expand to nothing.  The argument is evaluated more
 // than once, so it must be an lvalue expression without side effects.  All
 // shifting is done on an unsigned copy of the value so that no bit is ever
 // shifted into the sign bit of a signed operand.

 #if U_IS_BIG_ENDIAN
 #define SWAP32(x)
 #define SWAP16(x)
 #else
 #define SWAP32(x) (x) = (uint32_t)((((uint32_t)(x) >> 24) & 0xffU) | (((uint32_t)(x) >> 8) & 0xff00U) | (((uint32_t)(x) << 8) & 0xff0000U) | (((uint32_t)(x) << 24) & 0xff000000U))
 #define SWAP16(x) (x) = (uint16_t)((((uint16_t)(x) << 8) & 0xff00U) | (((uint16_t)(x) >> 8) & 0xffU))
 #endif

 /**
  * Fills in the dictionary tables from a memory stream holding the serialized
  * dictionary.  The sections are consumed strictly in stream order: version,
  * column map (index array, then data array), numCols, numColGroups, row-number
  * index, populated-cells bitmap index, bitmap list, row-shift index and finally
  * the state table.  Each array section is preceded by a 4-byte element count.
  * Multi-byte values are byte-swapped with SWAP32/SWAP16 after being read.
  *
  * NOTE(review): element counts are taken from the stream as-is and the results
  * of uprv_malloc() are not checked; this function has no way to report an error.
  *
  * @param in The stream to read from, positioned at the start of the data
  */
 void
 BreakDictionary::readDictionaryFile(UMemoryStream* in)
 {
     int32_t count;      // element count of the section being read
     int32_t version;    // read only to advance the stream; never inspected
     int idx;

     uprv_mstrm_read(in, &version, 4);

     // --- column map, serialized in its internal form ---
     // first the index array (16-bit entries) ...
     uprv_mstrm_read(in, &count, 4);
     SWAP32(count);
     uint16_t* mapIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * count);
     uprv_mstrm_read(in, mapIndex, count * sizeof(int16_t));
     for (idx = 0; idx < count; ++idx) {
         SWAP16(mapIndex[idx]);
     }
     // ... then the data array (single bytes, so nothing to swap)
     uprv_mstrm_read(in, &count, 4);
     SWAP32(count);
     int8_t* mapData = (int8_t*) uprv_malloc(sizeof(int8_t) * count);
     uprv_mstrm_read(in, mapData, count);
     // the compact array takes over both buffers
     columnMap = ucmp8_openAdopt(mapIndex, mapData, count);

     // --- table dimensions ---
     uprv_mstrm_read(in, &numCols, 4);
     SWAP32(numCols);
     uprv_mstrm_read(in, &numColGroups, 4);
     SWAP32(numColGroups);

     // --- row-number index ---
     uprv_mstrm_read(in, &count, 4);
     SWAP32(count);
     rowIndex = new int16_t[count];
     uprv_mstrm_read(in, rowIndex, count * sizeof(int16_t));
     for (idx = 0; idx < count; ++idx) {
         SWAP16(rowIndex[idx]);
     }

     // --- populated-cells bitmap: the index comes first ... ---
     uprv_mstrm_read(in, &count, 4);
     SWAP32(count);
     rowIndexFlagsIndex = new int16_t[count];
     uprv_mstrm_read(in, rowIndexFlagsIndex, count * sizeof(int16_t));
     for (idx = 0; idx < count; ++idx) {
         SWAP16(rowIndexFlagsIndex[idx]);
     }
     // --- ... followed by the list of 32-bit bitmap words ---
     uprv_mstrm_read(in, &count, 4);
     SWAP32(count);
     rowIndexFlags = new int32_t[count];
     uprv_mstrm_read(in, rowIndexFlags, count * sizeof(int32_t));
     for (idx = 0; idx < count; ++idx) {
         SWAP32(rowIndexFlags[idx]);
     }

     // --- row-shift index (single bytes, so nothing to swap) ---
     uprv_mstrm_read(in, &count, 4);
     SWAP32(count);
     rowIndexShifts = new int8_t[count];
     uprv_mstrm_read(in, rowIndexShifts, count);

     // --- the state table itself ---
     uprv_mstrm_read(in, &count, 4);
     SWAP32(count);
     table = new int16_t[count];
     uprv_mstrm_read(in, table, count * sizeof(int16_t));
     for (idx = 0; idx < count; ++idx) {
         SWAP16(table[idx]);
     }

     // The reverse column map follows in the stream; the C/C++ code does not
     // read it for the time being.
 }

 //=================================================================================
 // access to the words
 //=================================================================================

 /**
  * Character-based lookup: translates the character into a column number
  * through the column map and hands the result to the (row, column)
  * version of at().
  * @param row The current state
  * @param ch The character whose column we're interested in
  * @return The new state to transition to
  */
 int16_t
 BreakDictionary::at(int32_t row, UChar ch) const
 {
     // narrow to 16 bits first, then widen, so the column-number overload is chosen
     const int32_t column = (int16_t) ucmp8_get(columnMap, ch);
     return at(row, column);
 }

 /**
  * Fetches the value of the cell at the given (logical) row and column.
  * In DictionaryBasedBreakIterator, the row number is a state number, the
  * column number is an input, and the return value is the row number of the
  * new state to transition to.  (0 is the "error" state, and -1 is the
  * "end of word" state in a dictionary)
  * @param row The row number of the current state
  * @param col The column number of the input character (0 means "not a
  * dictionary character")
  * @return The row number of the new state to transition to; 0 when the
  * cell is not populated
  */
 int16_t
 BreakDictionary::at(int32_t row, int32_t col) const
 {
     // unpopulated cells all read as 0
     if (!cellIsPopulated(row, col)) {
         return 0;
     }

     // Logical -> physical mapping: rowIndex gives the physical row, and the
     // physical column is the logical column offset by this row's shift
     // value.  internalAt() then reads the value out of the table.
     const int32_t physicalRow = rowIndex[row];
     const int32_t physicalCol = col + rowIndexShifts[row];
     return internalAt(physicalRow, physicalCol);
 }

 //=================================================================================
 // implementation
 //=================================================================================
 /**
  * Tells whether the cell at the given (logical) row and column numbers
  * is populated.
  * @param row The logical row number
  * @param col The logical column number
  * @return true if the cell is populated
  */
 UBool
 BreakDictionary::cellIsPopulated(int32_t row, int32_t col) const
 {
     // the bitmap index entry for this row decides how the answer is found
     const int16_t entry = rowIndexFlagsIndex[row];

     if (entry >= 0) {
         // A non-negative entry is the offset of this row's first word in the
         // bitmap list.  Each word covers 32 columns, so a table wider than
         // that spreads a row over successive words: col / 32 selects the
         // word and col % 32 selects the bit within it.
         const int32_t word = rowIndexFlags[entry + (col >> 5)];
         const int32_t mask = 1 << (col & 0x1f);
         return (word & mask) != 0;
     }

     // A negative entry is the negated column number of the only populated
     // cell in the row.
     return col == -entry;
 }

 /**
  * Reads a cell that is already known to be populated; this is the
  * implementation behind at().
  * @param row The PHYSICAL row number of the cell
  * @param col The PHYSICAL column number of the cell
  * @return The value stored in the cell
  */
 int16_t
 BreakDictionary::internalAt(int32_t row, int32_t col) const
 {
     // The table is stored as a one-dimensional array with numCols entries
     // per row (a layout inherited from the Java original, where
     // two-dimensional arrays are inefficient), so compute the flat offset.
     const int32_t cell = row * numCols + col;
     return table[cell];
 }
	/*
	**********************************************************************
	* Copyright (C) 1999-2000 IBM and others. All rights reserved.
	**********************************************************************
	* Date Name Description
	* 12/1/99 rtg Ported from Java
	* 01/13/2000 helena Added UErrorCode to ctors.
	**********************************************************************
	*/

	#include "brkdict.h"
	#include "cmemory.h"
	#include "unicode/resbund.h"

	//=================================================================================
	// deserialization
	//=================================================================================

	/**
	 * Loads the break dictionary from the binary "BreakDictionaryData" item of
	 * the "th" locale resource bundle.
	 *
	 * Every member that the destructor releases is cleared before the first
	 * early return, so destroying an object whose construction stopped on a
	 * failing status never frees an uninitialized pointer.
	 *
	 * @param dictionaryFilename Not referenced by this implementation; the data
	 *        always comes from the resource bundle.
	 * @param status In/out error code.  Nothing is loaded if it already
	 *        indicates failure on entry; it is set to U_FILE_ACCESS_ERROR when
	 *        the memory stream over the resource data cannot be opened.
	 */
	BreakDictionary::BreakDictionary(char* dictionaryFilename, UErrorCode& status)
	{
	    // Put the object into a destructible state before anything can fail.
	    columnMap = 0;
	    table = 0;
	    rowIndex = 0;
	    rowIndexFlags = 0;
	    rowIndexFlagsIndex = 0;
	    rowIndexShifts = 0;
	    numCols = 0;
	    numColGroups = 0;

	    if (U_FAILURE(status)) return;

	    ResourceBundle th((char *)0, Locale("th"), status);

	    if (U_FAILURE(status)) return;

	    ResourceBundle th_dict = th.get("BreakDictionaryData", status);
	    if (U_FAILURE(status)) return;

	    int32_t len;
	    const uint8_t * data = th_dict.getBinary(len, status);
	    if (U_FAILURE(status)) return;

	    // wrap the resource bytes in a stream so they can be read sequentially
	    UMemoryStream* dictionaryStream = uprv_mstrm_openBuffer(data, len);

	    if (dictionaryStream == 0) {
	        status = U_FILE_ACCESS_ERROR;
	        return;
	    }
	    readDictionaryFile(dictionaryStream);
	    uprv_mstrm_close(dictionaryStream);
	}

	/**
	 * Releases the column map and every array allocated by readDictionaryFile().
	 */
	BreakDictionary::~BreakDictionary()
	{
	    // the column map is absent when construction stopped before loading
	    if (columnMap != 0) {
	        ucmp8_close(columnMap);
	    }

	    // delete [] on a null pointer is a no-op, so these need no guard
	    delete [] table;
	    delete [] rowIndex;
	    delete [] rowIndexFlags;
	    delete [] rowIndexFlagsIndex;
	    delete [] rowIndexShifts;
	}

	// Byte-order helpers for readDictionaryFile.  The data files originated from a
	// Java program, and Java always writes data out in big-endian format.  On
	// little-endian hosts SWAP32/SWAP16 reverse the bytes of their argument in place;
	// on big-endian hosts they expand to nothing.  The argument is evaluated more
	// than once, so it must be an lvalue expression without side effects.  All
	// shifting is done on an unsigned copy of the value so that no bit is ever
	// shifted into the sign bit of a signed operand.

	#if U_IS_BIG_ENDIAN
	#define SWAP32(x)
	#define SWAP16(x)
	#else
	#define SWAP32(x) (x) = (uint32_t)((((uint32_t)(x) >> 24) & 0xffU) | (((uint32_t)(x) >> 8) & 0xff00U) | (((uint32_t)(x) << 8) & 0xff0000U) | (((uint32_t)(x) << 24) & 0xff000000U))
	#define SWAP16(x) (x) = (uint16_t)((((uint16_t)(x) << 8) & 0xff00U) | (((uint16_t)(x) >> 8) & 0xffU))
	#endif

	/**
	 * Fills in the dictionary tables from a memory stream holding the serialized
	 * dictionary.  The sections are consumed strictly in stream order: version,
	 * column map (index array, then data array), numCols, numColGroups, row-number
	 * index, populated-cells bitmap index, bitmap list, row-shift index and finally
	 * the state table.  Each array section is preceded by a 4-byte element count.
	 * Multi-byte values are byte-swapped with SWAP32/SWAP16 after being read.
	 *
	 * NOTE(review): element counts are taken from the stream as-is and the results
	 * of uprv_malloc() are not checked; this function has no way to report an error.
	 *
	 * @param in The stream to read from, positioned at the start of the data
	 */
	void
	BreakDictionary::readDictionaryFile(UMemoryStream* in)
	{
	    int32_t count;      // element count of the section being read
	    int32_t version;    // read only to advance the stream; never inspected
	    int idx;

	    uprv_mstrm_read(in, &version, 4);

	    // --- column map, serialized in its internal form ---
	    // first the index array (16-bit entries) ...
	    uprv_mstrm_read(in, &count, 4);
	    SWAP32(count);
	    uint16_t* mapIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * count);
	    uprv_mstrm_read(in, mapIndex, count * sizeof(int16_t));
	    for (idx = 0; idx < count; ++idx) {
	        SWAP16(mapIndex[idx]);
	    }
	    // ... then the data array (single bytes, so nothing to swap)
	    uprv_mstrm_read(in, &count, 4);
	    SWAP32(count);
	    int8_t* mapData = (int8_t*) uprv_malloc(sizeof(int8_t) * count);
	    uprv_mstrm_read(in, mapData, count);
	    // the compact array takes over both buffers
	    columnMap = ucmp8_openAdopt(mapIndex, mapData, count);

	    // --- table dimensions ---
	    uprv_mstrm_read(in, &numCols, 4);
	    SWAP32(numCols);
	    uprv_mstrm_read(in, &numColGroups, 4);
	    SWAP32(numColGroups);

	    // --- row-number index ---
	    uprv_mstrm_read(in, &count, 4);
	    SWAP32(count);
	    rowIndex = new int16_t[count];
	    uprv_mstrm_read(in, rowIndex, count * sizeof(int16_t));
	    for (idx = 0; idx < count; ++idx) {
	        SWAP16(rowIndex[idx]);
	    }

	    // --- populated-cells bitmap: the index comes first ... ---
	    uprv_mstrm_read(in, &count, 4);
	    SWAP32(count);
	    rowIndexFlagsIndex = new int16_t[count];
	    uprv_mstrm_read(in, rowIndexFlagsIndex, count * sizeof(int16_t));
	    for (idx = 0; idx < count; ++idx) {
	        SWAP16(rowIndexFlagsIndex[idx]);
	    }
	    // --- ... followed by the list of 32-bit bitmap words ---
	    uprv_mstrm_read(in, &count, 4);
	    SWAP32(count);
	    rowIndexFlags = new int32_t[count];
	    uprv_mstrm_read(in, rowIndexFlags, count * sizeof(int32_t));
	    for (idx = 0; idx < count; ++idx) {
	        SWAP32(rowIndexFlags[idx]);
	    }

	    // --- row-shift index (single bytes, so nothing to swap) ---
	    uprv_mstrm_read(in, &count, 4);
	    SWAP32(count);
	    rowIndexShifts = new int8_t[count];
	    uprv_mstrm_read(in, rowIndexShifts, count);

	    // --- the state table itself ---
	    uprv_mstrm_read(in, &count, 4);
	    SWAP32(count);
	    table = new int16_t[count];
	    uprv_mstrm_read(in, table, count * sizeof(int16_t));
	    for (idx = 0; idx < count; ++idx) {
	        SWAP16(table[idx]);
	    }

	    // The reverse column map follows in the stream; the C/C++ code does not
	    // read it for the time being.
	}

	//=================================================================================
	// access to the words
	//=================================================================================

	/**
	 * Character-based lookup: translates the character into a column number
	 * through the column map and hands the result to the (row, column)
	 * version of at().
	 * @param row The current state
	 * @param ch The character whose column we're interested in
	 * @return The new state to transition to
	 */
	int16_t
	BreakDictionary::at(int32_t row, UChar ch) const
	{
	    // narrow to 16 bits first, then widen, so the column-number overload is chosen
	    const int32_t column = (int16_t) ucmp8_get(columnMap, ch);
	    return at(row, column);
	}

	/**
	 * Fetches the value of the cell at the given (logical) row and column.
	 * In DictionaryBasedBreakIterator, the row number is a state number, the
	 * column number is an input, and the return value is the row number of the
	 * new state to transition to.  (0 is the "error" state, and -1 is the
	 * "end of word" state in a dictionary)
	 * @param row The row number of the current state
	 * @param col The column number of the input character (0 means "not a
	 * dictionary character")
	 * @return The row number of the new state to transition to; 0 when the
	 * cell is not populated
	 */
	int16_t
	BreakDictionary::at(int32_t row, int32_t col) const
	{
	    // unpopulated cells all read as 0
	    if (!cellIsPopulated(row, col)) {
	        return 0;
	    }

	    // Logical -> physical mapping: rowIndex gives the physical row, and the
	    // physical column is the logical column offset by this row's shift
	    // value.  internalAt() then reads the value out of the table.
	    const int32_t physicalRow = rowIndex[row];
	    const int32_t physicalCol = col + rowIndexShifts[row];
	    return internalAt(physicalRow, physicalCol);
	}

	//=================================================================================
	// implementation
	//=================================================================================
	/**
	 * Tells whether the cell at the given (logical) row and column numbers
	 * is populated.
	 * @param row The logical row number
	 * @param col The logical column number
	 * @return true if the cell is populated
	 */
	UBool
	BreakDictionary::cellIsPopulated(int32_t row, int32_t col) const
	{
	    // the bitmap index entry for this row decides how the answer is found
	    const int16_t entry = rowIndexFlagsIndex[row];

	    if (entry >= 0) {
	        // A non-negative entry is the offset of this row's first word in the
	        // bitmap list.  Each word covers 32 columns, so a table wider than
	        // that spreads a row over successive words: col / 32 selects the
	        // word and col % 32 selects the bit within it.
	        const int32_t word = rowIndexFlags[entry + (col >> 5)];
	        const int32_t mask = 1 << (col & 0x1f);
	        return (word & mask) != 0;
	    }

	    // A negative entry is the negated column number of the only populated
	    // cell in the row.
	    return col == -entry;
	}

	/**
	 * Reads a cell that is already known to be populated; this is the
	 * implementation behind at().
	 * @param row The PHYSICAL row number of the cell
	 * @param col The PHYSICAL column number of the cell
	 * @return The value stored in the cell
	 */
	int16_t
	BreakDictionary::internalAt(int32_t row, int32_t col) const
	{
	    // The table is stored as a one-dimensional array with numCols entries
	    // per row (a layout inherited from the Java original, where
	    // two-dimensional arrays are inefficient), so compute the flat offset.
	    const int32_t cell = row * numCols + col;
	    return table[cell];
	}