blob: c4134b8c6ad6cad2b07a33d045616f56d5b85133 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2006-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import com.ibm.icu.impl.ICUBinary;
/* Format of cnvalias.icu -----------------------------------------------------
*
* cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
* This binary form contains several tables. All indexes are to uint16_t
* units, and not to the bytes (uint8_t units). Addressing everything on
* 16-bit boundaries allows us to store more information with small index
* numbers, which are also 16-bit in size. The majority of the tables (except
* the string table) consist of 16-bit numbers.
*
* First there is the size of the Table of Contents (TOC). The TOC
* entries contain the size of each section. In order to find the offset
* of a section you just need to sum up the sizes of the preceding sections.
* The TOC length and entries are an array of uint32_t values.
* The first section after the TOC starts immediately after the TOC.
*
* 1) This section contains a list of converters. This list contains indexes
* into the string table for the converter name. The index of this list is
* also used by other sections, which are mentioned later on.
* This list is not sorted.
*
* 2) This section contains a list of tags. This list contains indexes
* into the string table for the tag name. The index of this list is
* also used by other sections, which are mentioned later on.
* This list is in priority order of standards.
*
* 3) This section contains a list of sorted unique aliases. This
* list contains indexes into the string table for the alias name. The
* index of this list is also used by other sections, like the 4th section.
* The index for the 3rd and 4th section is used to get the
* alias -> converter name mapping. Section 3 and 4 form a two column table.
*
* 4) This section contains a list of mapped converter names. Consider this
* as a table that maps the 3rd section to the 1st section. This list contains
* indexes into the 1st section. The index of this list is the same index in
* the 3rd section. There is also some extra information in the high bits of
* each converter index in this table. Currently it's only used to say that
* an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
* and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
* the predigested form of the 5th section so that an alias lookup can be fast.
*
* 5) This section contains a 2D array with indexes to the 6th section. This
* section is the full form of all alias mappings. The column index is the
* index into the converter list (column header). The row index is the index
* to the tag list (row header). This 2D array is the top part of a 3D array. The
* third dimension is in the 6th section.
*
* 6) This is a blob of variable-length arrays. Each array starts with a size,
* and is followed by indexes to alias names in the string table. This is
* the third dimension of section 5. No other section should be referencing
* this section.
*
* 7) Reserved at this time (There is no information). This _usually_ has a
* size of 0. Future versions may add more information here.
*
* 8) This is the string table. All strings are indexed on an even address.
* There are two reasons for this. First many chip architectures locate strings
* faster on even address boundaries. Second, since all indexes are 16-bit
* numbers, this string table can be 128KB in size instead of 64KB when we
* only have strings starting on an even address.
*
*
* Here is the concept of section 5 and 6. It's a 3D cube. Each tag
* has a unique alias among all converters. That same alias can
* be mentioned in other standards on different converters,
* but only one alias per tag can be unique.
*
*
* Converter Names (Usually in TR22 form)
* -------------------------------------------.
* T / /|
* a / / |
* g / / |
* s / / |
* / / |
* ------------------------------------------/ |
* A | | |
* l | | |
* i | | /
* a | | /
* s | | /
* e | | /
* s | |/
* -------------------------------------------
*
*
*
* Here is what it really looks like. It's like swiss cheese.
* There are holes. Some converters aren't recognized by
* a standard, or they are really old converters that the
* standard doesn't recognize anymore.
*
* Converter Names (Usually in TR22 form)
* -------------------------------------------.
* T /##########################################/|
* a / # # /#
* g / # ## ## ### # ### ### ### #/
* s / # ##### #### ## ## #/#
* / ### # # ## # # # ### # # #/##
* ------------------------------------------/# #
* A |### # # ## # # # ### # # #|# #
* l |# # # # # ## # #|# #
* i |# # # # # # #|#
* a |# #|#
* s | #|#
* e
* s
*
*/
/**
 * Reader for the binary converter alias data (cnvalias.icu, data format "CvAl").
 * The constructor checks the ICU binary header; {@link #readToc(int)} and
 * {@link #read} then consume the remaining stream sequentially, so they must be
 * called in the order in which the data appears in the file.
 */
final class UConverterAliasDataReader implements ICUBinary.Authenticate {
    /**
     * <p>Protected constructor.</p>
     * @param inputStream ICU cnvalias.icu file input stream
     * @exception IOException throw if data file fails authentication
     */
    protected UConverterAliasDataReader(InputStream inputStream)
            throws IOException {
        // The header must be consumed from the raw stream before it is wrapped,
        // so that the DataInputStream starts at the first byte after the header.
        ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
        dataInputStream = new DataInputStream(inputStream);
    }

    // protected methods -------------------------------------------------

    /**
     * Reads {@code n} consecutive 32-bit table-of-contents entries.
     * <p>
     * The entries are unsigned 32-bit values in the file. A Java {@code int}
     * cannot represent values above {@code Integer.MAX_VALUE}, and masking an
     * {@code int} with {@code 0xffffffff} does not change it, so an entry with
     * the high bit set would otherwise be handed to the caller as a negative
     * number. Such an entry is rejected here instead.
     *
     * @param n number of entries to read
     * @return the entries, in file order; every element is non-negative
     * @exception IOException if the stream cannot be read or an entry does not
     *            fit in a non-negative {@code int}
     */
    protected int[] readToc(int n) throws IOException {
        int[] toc = new int[n];
        for (int i = 0; i < n; ++i) {
            int entry = dataInputStream.readInt();
            if (entry < 0) {
                throw new IOException("Invalid converter alias data: TOC entry " + i
                        + " (0x" + Integer.toHexString(entry) + ") is out of range");
            }
            toc[i] = entry;
        }
        return toc;
    }

    /**
     * Fills the given pre-sized arrays from the stream, in parameter order.
     * Each {@code int[]} receives one unsigned 16-bit value per element; the two
     * {@code byte[]} tables are read in full. The length of every array
     * determines how much data is consumed for it.
     *
     * @param convList converter list
     * @param tagList tag list
     * @param aliasList alias list
     * @param untaggedConvArray untagged converter array
     * @param taggedAliasArray tagged alias array
     * @param taggedAliasLists tagged alias lists
     * @param optionTable option table
     * @param stringTable string table bytes
     * @param normalizedStringTable normalized string table bytes
     * @exception IOException if the stream ends early or cannot be read
     */
    protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray, int[] taggedAliasArray, int[] taggedAliasLists, int[] optionTable, byte[] stringTable, byte[] normalizedStringTable) throws IOException{
        readUnsignedShorts(convList);
        readUnsignedShorts(tagList);
        readUnsignedShorts(aliasList);
        readUnsignedShorts(untaggedConvArray);
        readUnsignedShorts(taggedAliasArray);
        readUnsignedShorts(taggedAliasLists);
        readUnsignedShorts(optionTable);
        dataInputStream.readFully(stringTable);
        dataInputStream.readFully(normalizedStringTable);
    }

    /**
     * Checks whether the first bytes of the given data format version are
     * equal to {@code DATA_FORMAT_VERSION}.
     *
     * @param version data format version bytes from the file header
     * @return true if {@code version} is non-null, at least as long as
     *         {@code DATA_FORMAT_VERSION}, and matches it byte for byte
     */
    public boolean isDataVersionAcceptable(byte version[]) {
        if (version == null || version.length < DATA_FORMAT_VERSION.length) {
            return false;
        }
        for (int i = 0; i < DATA_FORMAT_VERSION.length; ++i) {
            if (version[i] != DATA_FORMAT_VERSION[i]) {
                return false;
            }
        }
        return true;
    }

    // private methods ---------------------------------------------------

    /** Fills every element of {@code dest} with the next unsigned 16-bit value. */
    private void readUnsignedShorts(int[] dest) throws IOException {
        for (int i = 0; i < dest.length; ++i) {
            dest[i] = dataInputStream.readUnsignedShort();
        }
    }

    // private data members -------------------------------------------------

    /**
     * ICU data file input stream, positioned after the binary header.
     */
    private final DataInputStream dataInputStream;

    /**
     * Data format identifier expected in the file header.
     * DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
     */
    private static final byte DATA_FORMAT_ID[] = {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c}; // dataFormat="CvAl"

    /**
     * Leading data format version bytes that this class accepts;
     * see {@link #isDataVersionAcceptable(byte[])}.
     */
    private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};
}