src/com/ibm/icu/charset/UConverterAliasDataReader.java - external/github.com/unicode-org/icu - Git at Google

 /**
 *******************************************************************************
 * Copyright (C) 2006, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 *******************************************************************************
 */

 package com.ibm.icu.charset;
 import java.io.*;

 import com.ibm.icu.impl.ICUBinary;
 import com.ibm.icu.impl.ICUDebug;

 /* Format of cnvalias.icu -----------------------------------------------------
  *
  * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
  * This binary form contains several tables. All indexes are to uint16_t
  * units, and not to the bytes (uint8_t units). Addressing everything on
  * 16-bit boundaries allows us to store more information with small index
  * numbers, which are also 16-bit in size. The majority of the table (except
  * the string table) are 16-bit numbers.
  *
  * First there is the size of the Table of Contents (TOC). The TOC
  * entries contain the size of each section. In order to find the offset
  * of a section you just need to sum up the sizes of the preceding sections.
  * The TOC length and entries are an array of uint32_t values.
  * The first section after the TOC starts immediately after the TOC.
  *
  * 1) This section contains a list of converters. This list contains indexes
  * into the string table for the converter name. The index of this list is
  * also used by other sections, which are mentioned later on.
  * This list is not sorted.
  *
  * 2) This section contains a list of tags. This list contains indexes
  * into the string table for the tag name. The index of this list is
  * also used by other sections, which are mentioned later on.
  * This list is in priority order of standards.
  *
  * 3) This section contains a list of sorted unique aliases. This
  * list contains indexes into the string table for the alias name. The
  * index of this list is also used by other sections, like the 4th section.
  * The index for the 3rd and 4th section is used to get the
  * alias -> converter name mapping. Section 3 and 4 form a two column table.
  *
  * 4) This section contains a list of mapped converter names. Consider this
  * as a table that maps the 3rd section to the 1st section. This list contains
  * indexes into the 1st section. The index of this list is the same index in
  * the 3rd section. There is also some extra information in the high bits of
  * each converter index in this table. Currently it's only used to say that
  * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
  * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
  * the predigested form of the 5th section so that an alias lookup can be fast.
  *
  * 5) This section contains a 2D array with indexes to the 6th section. This
  * section is the full form of all alias mappings. The column index is the
  * index into the converter list (column header). The row index is the index
  * to tag list (row header). This 2D array is the top part of a 3D array. The
  * third dimension is in the 6th section.
  *
  * 6) This is a blob of variable length arrays. Each array starts with a size,
  * and is followed by indexes to alias names in the string table. This is
  * the third dimension to the section 5. No other section should be referencing
  * this section.
  *
  * 7) Reserved at this time (There is no information). This _usually_ has a
  * size of 0. Future versions may add more information here.
  *
  * 8) This is the string table. All strings are indexed on an even address.
  * There are two reasons for this. First many chip architectures locate strings
  * faster on even address boundaries. Second, since all indexes are 16-bit
  * numbers, this string table can be 128KB in size instead of 64KB when we
  * only have strings starting on an even address.
  *
  *
  * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
  * has a unique alias among all converters. That same alias can
  * be mentioned in other standards on different converters,
  * but only one alias per tag can be unique.
  *
  *
  *              Converter Names (Usually in TR22 form)
  *           -------------------------------------------.
  *     T    /                                          /|
  *     a   /                                          / |
  *     g  /                                          /  |
  *     s /                                          /   |
  *      /                                          /    |
  *      ------------------------------------------/     |
  *    A |                                         |     |
  *    l |                                         |     |
  *    i |                                         |    /
  *    a |                                         |   /
  *    s |                                         |  /
  *    e |                                         | /
  *    s |                                         |/
  *      -------------------------------------------
  *
  *
  *
  * Here is what it really looks like. It's like swiss cheese.
  * There are holes. Some converters aren't recognized by
  * a standard, or they are really old converters that the
  * standard doesn't recognize anymore.
  *
  *              Converter Names (Usually in TR22 form)
  *           -------------------------------------------.
  *     T    /##########################################/|
  *     a   /     #            #                       /#
  *     g  /  #      ##     ##     ### # ### ### ### #/
  *     s / #             #####  ####        ##  ## #/#
  *      / ### # # ##  #  #   #          ### # #   #/##
  *      ------------------------------------------/# #
  *    A |### # # ##  #  #   #          ### # #   #|# #
  *    l |# # #    #     #               ## #     #|# #
  *    i |# # #    #     #                #       #|#
  *    a |#                                       #|#
  *    s |                                        #|#
  *    e
  *    s
  *
  */

 /**
  * Package-private reader for converter alias data carrying the "CvAl" data
  * format ID (the cnvalias.icu layout).
  *
  * <p>The constructor validates the ICU binary header via
  * {@code ICUBinary.readHeader}. {@code readToc} and {@code read} then pull
  * data sequentially from the same underlying stream, so callers must invoke
  * them in the order in which the sections are stored in the file.</p>
  */
 final class UConverterAliasDataReader implements ICUBinary.Authenticate {
     private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");

     /**
      * <p>Protected constructor.</p>
      * Reads and checks the ICU header, then wraps the remainder of the
      * stream in a {@link DataInputStream} for the section readers.
      * @param inputStream ICU converter alias data (cnvalias.icu) input stream
      * @exception IOException throw if data file fails authentication
      * @draft 2.1
      */
     protected UConverterAliasDataReader(InputStream inputStream)
                                         throws IOException {
         if (debug) {
             System.out.println("Bytes in inputStream " + inputStream.available());
         }

         unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);

         if (debug) {
             System.out.println("Bytes left in inputStream " +inputStream.available());
         }

         dataInputStream = new DataInputStream(inputStream);

         if (debug) {
             System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
         }
     }

     // protected methods -------------------------------------------------

     /**
      * Reads {@code n} consecutive 32-bit big-endian values from the stream.
      *
      * <p>NOTE: the values are stored in a Java {@code int}, and
      * {@code UNSIGNED_INT_MASK} is {@code 0xffffffff} (that is, -1), so the
      * mask leaves every value unchanged. An entry above
      * {@code Integer.MAX_VALUE} therefore comes back negative. The mask is
      * kept only so that the returned values stay identical to what callers
      * have always received.</p>
      *
      * @param n number of table-of-contents entries to read
      * @return a new array of length {@code n} holding the entries in file order
      * @exception IOException if the stream ends early or cannot be read
      */
     protected int[] readToc(int n) throws IOException {
         int[] toc = new int[n];
         for (int i = 0; i < n; ++i) {
             toc[i] = dataInputStream.readInt() & UNSIGNED_INT_MASK;
         }
         return toc;
     }

     /**
      * Fills the supplied, caller-sized arrays from the stream, in parameter
      * order. The seven {@code int[]} arrays receive one unsigned 16-bit value
      * per element; the two {@code byte[]} arrays are filled with raw bytes.
      *
      * <p>The byte tables are read with {@code readFully}: a plain
      * {@code read(byte[])} may legally return after filling only part of the
      * array, which would silently leave the tail of a string table zeroed.</p>
      *
      * @param convList              destination for the converter list section
      * @param tagList               destination for the tag list section
      * @param aliasList             destination for the alias list section
      * @param untaggedConvArray     destination for the untagged converter array section
      * @param taggedAliasArray      destination for the tagged alias array section
      * @param taggedAliasLists      destination for the tagged alias lists section
      * @param optionTable           destination for the option table section
      * @param stringTable           destination for the string table bytes
      * @param normalizedStringTable destination for the normalized string table bytes
      * @exception IOException if the stream cannot be read; an
      *            {@link EOFException} is thrown when the data ends before
      *            every array has been completely filled
      */
     protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray, int[] taggedAliasArray, int[] taggedAliasLists, int[] optionTable, byte[] stringTable, byte[] normalizedStringTable) throws IOException {
         readUnsignedShorts(convList);
         readUnsignedShorts(tagList);
         readUnsignedShorts(aliasList);
         readUnsignedShorts(untaggedConvArray);
         readUnsignedShorts(taggedAliasArray);
         readUnsignedShorts(taggedAliasLists);
         readUnsignedShorts(optionTable);

         dataInputStream.readFully(stringTable);
         dataInputStream.readFully(normalizedStringTable);
     }

     /** Fills {@code dest} with consecutive unsigned 16-bit values from the stream. */
     private void readUnsignedShorts(int[] dest) throws IOException {
         for (int i = 0; i < dest.length; ++i) {
             dest[i] = dataInputStream.readUnsignedShort();
         }
     }

     /**
      * Accepts a data file only when the first three bytes of its format
      * version equal {@code DATA_FORMAT_VERSION}; any further bytes are ignored.
      *
      * @param version format version bytes taken from the data file header
      * @return true if the version has at least three bytes and they all match
      */
     public boolean isDataVersionAcceptable(byte version[])
     {
         if (version.length < DATA_FORMAT_VERSION.length) {
             return false;
         }
         for (int i = 0; i < DATA_FORMAT_VERSION.length; ++i) {
             if (version[i] != DATA_FORMAT_VERSION[i]) {
                 return false;
             }
         }
         return true;
     }

     /**
      * @return the byte array returned by {@code ICUBinary.readHeader} during
      *         construction (the internal array itself, not a copy)
      */
     public byte[] getUnicodeVersion() {
         return unicodeVersion;
     }

     // private data members -------------------------------------------------

     /**
      * ICU data file input stream
      */
     private final DataInputStream dataInputStream;

     /** Value returned by {@code ICUBinary.readHeader} for this data file. */
     private final byte[] unicodeVersion;

     /**
      * File format version that this class understands.
      * No guarantees are made if an older version is used.
      * NOTE(review): the original comment referred to "store.c of gennorm" for
      * more information; that looks copied from a different reader - confirm.
      */
     // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
     private static final byte DATA_FORMAT_ID[] = {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c}; // dataFormat="CvAl"
     private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};

     //private static final int UNSIGNED_SHORT_MASK = 0xffff;
     // As an int this literal is -1, so masking with it does not change the value.
     private static final int UNSIGNED_INT_MASK = 0xffffffff;

 }
	/**
	*******************************************************************************
	* Copyright (C) 2006, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*
	*******************************************************************************
	*/

	package com.ibm.icu.charset;
	import java.io.*;

	import com.ibm.icu.impl.ICUBinary;
	import com.ibm.icu.impl.ICUDebug;

	/* Format of cnvalias.icu -----------------------------------------------------
	*
	* cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
	* This binary form contains several tables. All indexes are to uint16_t
	* units, and not to the bytes (uint8_t units). Addressing everything on
	* 16-bit boundaries allows us to store more information with small index
	* numbers, which are also 16-bit in size. The majority of the table (except
	* the string table) are 16-bit numbers.
	*
	* First there is the size of the Table of Contents (TOC). The TOC
	* entries contain the size of each section. In order to find the offset
	* of a section you just need to sum up the sizes of the preceding sections.
	* The TOC length and entries are an array of uint32_t values.
	* The first section after the TOC starts immediately after the TOC.
	*
	* 1) This section contains a list of converters. This list contains indexes
	* into the string table for the converter name. The index of this list is
	* also used by other sections, which are mentioned later on.
	* This list is not sorted.
	*
	* 2) This section contains a list of tags. This list contains indexes
	* into the string table for the tag name. The index of this list is
	* also used by other sections, which are mentioned later on.
	* This list is in priority order of standards.
	*
	* 3) This section contains a list of sorted unique aliases. This
	* list contains indexes into the string table for the alias name. The
	* index of this list is also used by other sections, like the 4th section.
	* The index for the 3rd and 4th section is used to get the
	* alias -> converter name mapping. Section 3 and 4 form a two column table.
	*
	* 4) This section contains a list of mapped converter names. Consider this
	* as a table that maps the 3rd section to the 1st section. This list contains
	* indexes into the 1st section. The index of this list is the same index in
	* the 3rd section. There is also some extra information in the high bits of
	* each converter index in this table. Currently it's only used to say that
	* an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
	* and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
	* the predigested form of the 5th section so that an alias lookup can be fast.
	*
	* 5) This section contains a 2D array with indexes to the 6th section. This
	* section is the full form of all alias mappings. The column index is the
	* index into the converter list (column header). The row index is the index
	* to tag list (row header). This 2D array is the top part of a 3D array. The
	* third dimension is in the 6th section.
	*
	* 6) This is a blob of variable length arrays. Each array starts with a size,
	* and is followed by indexes to alias names in the string table. This is
	* the third dimension to the section 5. No other section should be referencing
	* this section.
	*
	* 7) Reserved at this time (There is no information). This _usually_ has a
	* size of 0. Future versions may add more information here.
	*
	* 8) This is the string table. All strings are indexed on an even address.
	* There are two reasons for this. First many chip architectures locate strings
	* faster on even address boundaries. Second, since all indexes are 16-bit
	* numbers, this string table can be 128KB in size instead of 64KB when we
	* only have strings starting on an even address.
	*
	*
	* Here is the concept of section 5 and 6. It's a 3D cube. Each tag
	* has a unique alias among all converters. That same alias can
	* be mentioned in other standards on different converters,
	* but only one alias per tag can be unique.
	*
	*
	*              Converter Names (Usually in TR22 form)
	*           -------------------------------------------.
	*     T    /                                          /|
	*     a   /                                          / |
	*     g  /                                          /  |
	*     s /                                          /   |
	*      /                                          /    |
	*      ------------------------------------------/     |
	*    A |                                         |     |
	*    l |                                         |     |
	*    i |                                         |    /
	*    a |                                         |   /
	*    s |                                         |  /
	*    e |                                         | /
	*    s |                                         |/
	*      -------------------------------------------
	*
	*
	*
	* Here is what it really looks like. It's like swiss cheese.
	* There are holes. Some converters aren't recognized by
	* a standard, or they are really old converters that the
	* standard doesn't recognize anymore.
	*
	*              Converter Names (Usually in TR22 form)
	*           -------------------------------------------.
	*     T    /##########################################/|
	*     a   /     #            #                       /#
	*     g  /  #      ##     ##     ### # ### ### ### #/
	*     s / #             #####  ####        ##  ## #/#
	*      / ### # # ##  #  #   #          ### # #   #/##
	*      ------------------------------------------/# #
	*    A |### # # ##  #  #   #          ### # #   #|# #
	*    l |# # #    #     #               ## #     #|# #
	*    i |# # #    #     #                #       #|#
	*    a |#                                       #|#
	*    s |                                        #|#
	*    e
	*    s
	*
	*/

	/**
	 * Package-private reader for converter alias data carrying the "CvAl" data
	 * format ID (the cnvalias.icu layout).
	 *
	 * <p>The constructor validates the ICU binary header via
	 * {@code ICUBinary.readHeader}. {@code readToc} and {@code read} then pull
	 * data sequentially from the same underlying stream, so callers must invoke
	 * them in the order in which the sections are stored in the file.</p>
	 */
	final class UConverterAliasDataReader implements ICUBinary.Authenticate {
		private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");

		/**
		 * <p>Protected constructor.</p>
		 * Reads and checks the ICU header, then wraps the remainder of the
		 * stream in a {@link DataInputStream} for the section readers.
		 * @param inputStream ICU converter alias data (cnvalias.icu) input stream
		 * @exception IOException throw if data file fails authentication
		 * @draft 2.1
		 */
		protected UConverterAliasDataReader(InputStream inputStream)
				throws IOException {
			if (debug) {
				System.out.println("Bytes in inputStream " + inputStream.available());
			}

			unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);

			if (debug) {
				System.out.println("Bytes left in inputStream " +inputStream.available());
			}

			dataInputStream = new DataInputStream(inputStream);

			if (debug) {
				System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
			}
		}

		// protected methods -------------------------------------------------

		/**
		 * Reads {@code n} consecutive 32-bit big-endian values from the stream.
		 *
		 * <p>NOTE: the values are stored in a Java {@code int}, and
		 * {@code UNSIGNED_INT_MASK} is {@code 0xffffffff} (that is, -1), so the
		 * mask leaves every value unchanged. An entry above
		 * {@code Integer.MAX_VALUE} therefore comes back negative. The mask is
		 * kept only so that the returned values stay identical to what callers
		 * have always received.</p>
		 *
		 * @param n number of table-of-contents entries to read
		 * @return a new array of length {@code n} holding the entries in file order
		 * @exception IOException if the stream ends early or cannot be read
		 */
		protected int[] readToc(int n) throws IOException {
			int[] toc = new int[n];
			for (int i = 0; i < n; ++i) {
				toc[i] = dataInputStream.readInt() & UNSIGNED_INT_MASK;
			}
			return toc;
		}

		/**
		 * Fills the supplied, caller-sized arrays from the stream, in parameter
		 * order. The seven {@code int[]} arrays receive one unsigned 16-bit value
		 * per element; the two {@code byte[]} arrays are filled with raw bytes.
		 *
		 * <p>The byte tables are read with {@code readFully}: a plain
		 * {@code read(byte[])} may legally return after filling only part of the
		 * array, which would silently leave the tail of a string table zeroed.</p>
		 *
		 * @param convList              destination for the converter list section
		 * @param tagList               destination for the tag list section
		 * @param aliasList             destination for the alias list section
		 * @param untaggedConvArray     destination for the untagged converter array section
		 * @param taggedAliasArray      destination for the tagged alias array section
		 * @param taggedAliasLists      destination for the tagged alias lists section
		 * @param optionTable           destination for the option table section
		 * @param stringTable           destination for the string table bytes
		 * @param normalizedStringTable destination for the normalized string table bytes
		 * @exception IOException if the stream cannot be read; an
		 *            {@link EOFException} is thrown when the data ends before
		 *            every array has been completely filled
		 */
		protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray, int[] taggedAliasArray, int[] taggedAliasLists, int[] optionTable, byte[] stringTable, byte[] normalizedStringTable) throws IOException {
			readUnsignedShorts(convList);
			readUnsignedShorts(tagList);
			readUnsignedShorts(aliasList);
			readUnsignedShorts(untaggedConvArray);
			readUnsignedShorts(taggedAliasArray);
			readUnsignedShorts(taggedAliasLists);
			readUnsignedShorts(optionTable);

			dataInputStream.readFully(stringTable);
			dataInputStream.readFully(normalizedStringTable);
		}

		/** Fills {@code dest} with consecutive unsigned 16-bit values from the stream. */
		private void readUnsignedShorts(int[] dest) throws IOException {
			for (int i = 0; i < dest.length; ++i) {
				dest[i] = dataInputStream.readUnsignedShort();
			}
		}

		/**
		 * Accepts a data file only when the first three bytes of its format
		 * version equal {@code DATA_FORMAT_VERSION}; any further bytes are ignored.
		 *
		 * @param version format version bytes taken from the data file header
		 * @return true if the version has at least three bytes and they all match
		 */
		public boolean isDataVersionAcceptable(byte version[])
		{
			if (version.length < DATA_FORMAT_VERSION.length) {
				return false;
			}
			for (int i = 0; i < DATA_FORMAT_VERSION.length; ++i) {
				if (version[i] != DATA_FORMAT_VERSION[i]) {
					return false;
				}
			}
			return true;
		}

		/**
		 * @return the byte array returned by {@code ICUBinary.readHeader} during
		 *         construction (the internal array itself, not a copy)
		 */
		public byte[] getUnicodeVersion() {
			return unicodeVersion;
		}

		// private data members -------------------------------------------------

		/**
		 * ICU data file input stream
		 */
		private final DataInputStream dataInputStream;

		/** Value returned by {@code ICUBinary.readHeader} for this data file. */
		private final byte[] unicodeVersion;

		/**
		 * File format version that this class understands.
		 * No guarantees are made if an older version is used.
		 * NOTE(review): the original comment referred to "store.c of gennorm" for
		 * more information; that looks copied from a different reader - confirm.
		 */
		// DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
		private static final byte DATA_FORMAT_ID[] = {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c}; // dataFormat="CvAl"
		private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};

		//private static final int UNSIGNED_SHORT_MASK = 0xffff;
		// As an int this literal is -1, so masking with it does not change the value.
		private static final int UNSIGNED_INT_MASK = 0xffffffff;

	}