blob: 97dff354de48c91dec9d093d9c2033d9b0ae025f [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacterNameDB.java,v $
* $Date: 2001/03/23 19:51:38 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.io.IOException;
/**
* Internal class used for Unicode character name database.
* Database classes store binary data read from uprops.dat and unames for use.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.
* Due to the form most commonly used for retrieval, array of char is used
* to store the binary data
* UCharacterNameDB also contains indexes to significant points in the binary
* data.
* Responsibility for molding the binary data into more meaning form lies on
* <a href=UCharacterPpty.html>UCharacterPpty</a> and
* <a href=UCharacterName.html>UCharacterName</a>.
* Data populated by <a href=UGenNameReader.html>UGenNameReader</a>
* @author Syn Wee Quek
* @since oct2700
* @see com.ibm.text.UGenReader
*/
final class UCharacterNameDB extends UCharacterDB
{
// private variable =============================================
/**
* Data used in unames.dat
*/
private char m_tokentable_[];
private byte m_tokenstring_[];
private char m_groupinfo_[];
private byte m_groupstring_[];
private AlgorithmName m_algorithm_[];
/**
* Number of group sets
*/
private int m_groupcount_ = 0;
private int m_groupsize_ = 0;
/**
* Default name of the name datafile
*/
private static final String NAME_FILE_NAME_ = "unames.dat";
/**
* Default buffer size of datafile
*/
private static final int NAME_BUFFER_SIZE_ = 100000;
/**
* Shift count to retrieve group information
*/
private static final int GROUP_SHIFT_ = 5;
/**
* Number of lines per group
*/
private static final int LINES_PER_GROUP_ = 1 << GROUP_SHIFT_;
/**
* Mask to retrieve the offset for a particular character within a group
*/
private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
/**
* Position of offsethigh in group information array
*/
private static final int OFFSET_HIGH_OFFSET_ = 1;
/**
* Position of offsetlow in group information array
*/
private static final int OFFSET_LOW_OFFSET_ = 2;
/**
* Indicator of if Unicode 1.0 names are available
*/
private static boolean UNICODE_1_;
/**
* Double nibble indicator, any nibble > this number has to be combined
* with its following nibble
*/
private static final int SINGLE_NIBBLE_MAX_ = 11;
// constructor ====================================================
/**
* protected constructor
* @exception thrown when data reading fails or when data has been corrupted
*/
protected UCharacterNameDB() throws IOException
{
UGenNameReader reader = new UGenNameReader();
InputStream i = getClass().getResourceAsStream(NAME_FILE_NAME_);
BufferedInputStream b = new BufferedInputStream(i, NAME_BUFFER_SIZE_);
DataInputStream d = new DataInputStream(b);
reader.read(d, this);
d.close();
UNICODE_1_ = (';' >= m_tokentable_.length) ||
(m_tokentable_[(int)';'] == 0xFFFF);
}
// public method ==================================================
/**
* toString method for printing
*/
public String toString()
{
StringBuffer result = new StringBuffer("names content \n");
/*result.append(super.toString());
result.append('\n');
result.append("token string offset ");
result.append(m_tokenstringoffset_);
result.append("\n");
result.append("group offset ");
result.append(m_groupsoffset_);
result.append("\n");
result.append("group string offset ");
result.append(m_groupstringoffset_);
result.append("\n");
result.append("alg names offset ");
result.append(m_algnamesoffset_);
result.append("\n");
*/
return result.toString();
}
// protected methods ===============================================
/**
* Sets the token data
* @param token array of tokens
* @param tokenstring array of string values of the tokens
* @return false if there is a data error
*/
protected boolean setToken(char token[], byte tokenstring[])
{
if (token != null && tokenstring != null && token.length > 0 &&
tokenstring.length > 0) {
m_tokentable_ = token;
m_tokenstring_ = tokenstring;
return true;
}
return false;
}
/**
* Sets the number of group and size of each group in number of char
* @param count number of groups
* @param size size of group in char
* @return true if group size is set correctly
*/
protected boolean setGroupCountSize(int count, int size)
{
if (count <= 0 || size <= 0) {
return false;
}
m_groupcount_ = count;
m_groupsize_ = size;
return true;
}
/**
* Sets the group name data
* @param group index information array
* @param groupstring name information array
* @return false if there is a data error
*/
protected boolean setGroup(char group[], byte groupstring[])
{
if (group != null && groupstring != null && group.length > 0 &&
groupstring.length > 0) {
m_groupinfo_ = group;
m_groupstring_ = groupstring;
return true;
}
return false;
}
/**
* Binary search for the group strings set that contains the argument Unicode
* code point's most significant bits.
* The return value is always a valid group string set that contain msb.
* If group string set is not found, -1 is returned
* @param ch the code point to look for
* @return group string set index in datatable otherwise -1 is returned if
* group string set is not found
*/
protected int getGroupStringIndex(int ch)
{
// gets the msb
int msb = ch >> GROUP_SHIFT_,
end = m_groupcount_,
start,
gindex = 0;
// binary search for the group of names that contains the one for code
for (start = 0; start < end - 1;) {
gindex = (start + end) >> 1;
if (msb < getGroupMSB(gindex)) {
end = gindex;
}
else {
start = gindex;
}
}
// return this if it is an exact match
if (msb == getGroupMSB(start)) {
start = start * m_groupsize_;
return UCharacterUtil.toInt(m_groupinfo_[start + OFFSET_HIGH_OFFSET_],
m_groupinfo_[start + OFFSET_LOW_OFFSET_]);
}
return -1;
}
/**
* Returns the number of the group information object
* @return number of group information object
*/
protected int countGroup()
{
return m_groupcount_;
}
/**
* Gets the group name of the character
* @param ch character to get the group name
* @param choice name choice selector to choose a unicode 1.0 or newer name
*/
protected String getGroupName(int ch, int choice)
{
if (choice != UCharacterNameChoice.U_UNICODE_CHAR_NAME && !UNICODE_1_) {
// if not modern name requested and semicolon byte value is a character,
// not a token number, otherwise since only modern names are stored in
// unames.dat and there is no such requested Unicode 1.0 name here
return null;
}
// gets the msb
int msb = ch >> GROUP_SHIFT_,
end = m_groupcount_,
start,
gindex = 0;
// binary search for the group of names that contains the one for code
for (start = 0; start < end - 1;) {
gindex = (start + end) >> 1;
if (msb < getGroupMSB(gindex)) {
end = gindex;
}
else {
start = gindex;
}
}
// return this if it is an exact match
if (msb == getGroupMSB(start)) {
char offsets[] = new char[LINES_PER_GROUP_ + 1];
char lengths[] = new char[LINES_PER_GROUP_ + 1];
int index = getGroupLengths(start, offsets, lengths);
int offset = ch & GROUP_MASK_;
return getGroupName(index + offsets[offset], lengths[offset], choice);
}
return null;
}
/**
* Getting the character with the tokenized argument name
* @param index of the group to check
* @param name of the character
* @param choice of Unicode version used
* @return character with the tokenized argument name or -1 if character is
* not found
*/
protected int getGroupChar(int index, String name, int choice)
{
if (choice != UCharacterNameChoice.U_UNICODE_CHAR_NAME &&
!UNICODE_1_) {
// semicolon byte value is a token number , therefore only modern
// names are stored in unames.dat and there is no such requested
// Unicode 1.0 name here
return -1;
}
// populating the data set of grouptable
char offsets[] = new char[LINES_PER_GROUP_ + 1];
char lengths[] = new char[LINES_PER_GROUP_ + 1];
int startgpstrindex = getGroupLengths(index, offsets, lengths);
// shift out to function
int result = getGroupChar(startgpstrindex, lengths, name, choice);
if (result != -1) {
return (getGroupMSB(index) << GROUP_SHIFT_) | result;
}
return -1;
}
/**
* Set the algorithm name information array
* @param algorithm information array
* @return true if the group string offset has been set correctly
*/
protected boolean setAlgorithm(AlgorithmName alg[])
{
if (alg != null && alg.length != 0) {
m_algorithm_ = alg;
return true;
}
return false;
}
/**
* Get the number of algorithm name groups
* @return number of algorithm name groups
*/
protected int countAlgorithm()
{
if (m_algorithm_ == null) {
return 0;
}
return m_algorithm_.length;
}
/**
* Gets the index of the Algorithm object the argument code point lies
* @param ch code point
* @return index of the Algorithm object the argument code point lies,
* otherwise -1 if code point is not found in Algorithm objects
*/
protected int getAlgorithmIndex(int ch)
{
for (int index = m_algorithm_.length - 1; index >= 0; index --) {
if (m_algorithm_[index].contains(ch)) {
return index;
}
}
return -1;
}
/**
* Appends algorithm name of code point into StringBuffer.
* Note this method does not check for validity of code point in Algorithm,
* result is undefined if code point does not belong in Algorithm.
* @param index of Algorithm object in array
* @param ch code point
* @param str StringBuffer to append to
*/
protected void appendAlgorithmName(int index, int ch, StringBuffer str)
{
m_algorithm_[index].appendName(ch, str);
}
/**
* Get algorithm code point for the argument name at index. If name is not
* found in algorithm, -1 is returned.
* @param index algorithm index
* @param name code point name
* @param code point in algorithm that matches name, -1 otherwise
*/
protected int getAlgorithmChar(int index, String name)
{
return m_algorithm_[index].getAlgorithmChar(name);
}
// private methods =================================================
/**
* Gets the most significant bits representation in the argument group
* @param index the indexth group in datatable
* @return most significant bits representation of group
*/
private char getGroupMSB(int index)
{
return m_groupinfo_[index * m_groupsize_];
}
/**
* Reads a block of compressed lengths of 32 strings and expands them into
* offsets and lengths for each string. Lengths are stored with a
* variable-width encoding in consecutive nibbles:
* If a nibble<0xc, then it is the length itself (0 = empty string).
* If a nibble>=0xc, then it forms a length value with the following nibble.
* The offsets and lengths arrays must be at least 33 (one more) long because
* there is no check here at the end if the last nibble is still used.
* @param index of group string object in array
* @param offsets array to store the value of the string offsets
* @param lengths array to store the value of the string length
* @return next index of the data string immediately after the lengths
* in terms of byte address
*/
private int getGroupLengths(int index, char offsets[], char lengths[])
{
char length = 0xffff;
byte b = 0,
n = 0;
int shift;
index = index * m_groupsize_; // byte count offsets of group strings
int stringoffset = UCharacterUtil.toInt(
m_groupinfo_[index + OFFSET_HIGH_OFFSET_],
m_groupinfo_[index + OFFSET_LOW_OFFSET_]);
offsets[0] = 0;
// all 32 lengths must be read to get the offset of the first group string
for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) {
b = m_groupstring_[stringoffset];
shift = 4;
while (shift >= 0) {
// getting nibble
n = (byte)((b >> shift) & 0x0F);
if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) {
length = (char)((n - 12) << 4);
}
else {
if (length != 0xffff) {
lengths[i] = (char)((length | n) + 12);
}
else {
lengths[i] = (char)n;
}
if (i < LINES_PER_GROUP_) {
offsets[i + 1] = (char)(offsets[i] + lengths[i]);
}
length = 0xffff;
i ++;
}
shift -= 4;
}
}
return stringoffset;
}
/**
* Gets the name of the argument group index
* @param index of the group name string in byte count
* @param length of the group name string
* @param choice of Unicode 1.0 name or the most current name
* @return name of the group
*/
private String getGroupName(int index, int length, int choice)
{
if (choice != UCharacterNameChoice.U_UNICODE_CHAR_NAME) {
int oldindex = index;
index += UCharacterUtil.skipByteSubString(m_groupstring_, index, length,
(byte)';');
length -= (index - oldindex);
}
StringBuffer s = new StringBuffer();
byte b;
char token;
for (int i = 0; i < length;) {
b = m_groupstring_[index + i];
i ++;
if (b >= m_tokentable_.length) {
if (b == ';') {
break;
}
s.append(b); // implicit letter
}
else {
token = m_tokentable_[b & 0x00ff];
if (token == 0xFFFE) {
// this is a lead byte for a double-byte token
token = m_tokentable_[b << 8 | (m_groupstring_[index + i] & 0x00ff)];
i ++;
}
if (token == 0xFFFF) {
if (b == ';') {
break;
}
s.append((char)(b & 0x00ff)); // explicit letter
}
else { // write token word
UCharacterUtil.getNullTermByteSubString(s, m_tokenstring_, token);
}
}
}
if (s.length() == 0) {
return null;
}
return s.toString();
}
/**
* Compares and retrieve character if name is found within the argument
* group
* @param index index where the set of names reside in the group block
* @param length list of lengths of the strings
* @param name character name to search for
* @param choice of either 1.0 or the most current unicode name
* @return relative character in the group which matches name, otherwise if
* not found, -1 will be returned
*/
private int getGroupChar(int index, char length[], String name, int choice)
{
byte b = 0;
char token;
int len;
int namelen = name.length();
int nindex;
int count;
for (int result = 0; result <= LINES_PER_GROUP_; result ++) {
nindex = 0;
len = length[result];
if (choice != UCharacterNameChoice.U_UNICODE_CHAR_NAME) {
int oldindex = index;
index += UCharacterUtil.skipByteSubString(m_groupstring_, index, len,
(byte)';');
len -= (index - oldindex);
}
// number of tokens is > the length of the name
// write each letter directly, and write a token word per token
for (count = 0; count < len && nindex != -1 && nindex < namelen;) {
b = m_groupstring_[index + count];
count ++;
if (b >= m_tokentable_.length) {
if (name.charAt(nindex ++) != (b & 0xFF)) {
nindex = -1;
}
}
else {
token = m_tokentable_[b & 0xFF];
if (token == 0xFFFE) {
// this is a lead byte for a double-byte token
token = m_tokentable_[b << 8 |
(m_groupstring_[index + count] & 0x00ff)];
count ++;
}
if (token == 0xFFFF) {
if (name.charAt(nindex ++) != (b & 0xFF)) {
nindex = -1;
}
}
else {
// compare token with name
nindex = UCharacterUtil.compareNullTermByteSubString(name,
m_tokenstring_, nindex, token);
}
}
}
if (namelen == nindex &&
(count == len || m_groupstring_[index + count] == ';')) {
return result;
}
index += len;
}
return -1;
}
// protected inner class ===========================================
/**
* Algorithmic name class
*/
static final class AlgorithmName
{
// protected variables ===========================================
/**
* Constant type value of the different AlgorithmName
*/
protected static final int TYPE_0_ = 0;
protected static final int TYPE_1_ = 1;
// private variables =============================================
/**
* Algorithmic data information
*/
private int m_rangestart_;
private int m_rangeend_;
private byte m_type_;
private byte m_variant_;
private char m_factor_[];
private String m_prefix_;
private byte m_factorstring_[];
// constructor ===================================================
/**
* Constructor
*/
protected AlgorithmName()
{
}
// protected methods =============================================
/**
* Sets the information for accessing the algorithmic names
* @param rangestart starting code point that lies within this name group
* @param rangeend end code point that lies within this name group
* @param type algorithm type. There's 2 kinds of algorithmic type. First
* which uses code point as part of its name and the other uses
* variant postfix strings
* @param variant algorithmic variant
* @return true if values are valid
*/
protected boolean setInfo(int rangestart, int rangeend, byte type,
byte variant)
{
if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend &&
rangeend <= UCharacter.MAX_VALUE &&
(type == TYPE_0_ || type == TYPE_1_)) {
m_rangestart_ = rangestart;
m_rangeend_ = rangeend;
m_type_ = type;
m_variant_ = variant;
return true;
}
return false;
}
/**
* Sets the factor data
* @param array of factor
* @return true if factors are valid
*/
protected boolean setFactor(char factor[])
{
if (factor.length == m_variant_) {
m_factor_ = factor;
return true;
}
return false;
}
/**
* Sets the name prefix
* @param prefix
* @return true if prefix is set
*/
protected boolean setPrefix(String prefix)
{
if (prefix != null && prefix.length() > 0) {
m_prefix_ = prefix;
return true;
}
return false;
}
/**
* Sets the variant factorized name data
* @param string variant factorized name data
* @return true if values are set
*/
protected boolean setFactorString(byte string[])
{
// factor and variant string can be empty for things like hanggul code
// points
m_factorstring_ = string;
return true;
}
/**
* Checks if code point lies in Algorithm object at index
* @param ch code point
*/
protected boolean contains(int ch)
{
return m_rangestart_ <= ch && ch <= m_rangeend_;
}
/**
* Appends algorithm name of code point into StringBuffer.
* Note this method does not check for validity of code point in Algorithm,
* result is undefined if code point does not belong in Algorithm.
* @param ch code point
* @param str StringBuffer to append to
*/
protected void appendName(int ch, StringBuffer str)
{
str.append(m_prefix_);
switch (m_type_)
{
case TYPE_0_:
// prefix followed by hex digits indicating variants
str.append(Integer.toHexString(ch));
break;
case TYPE_1_:
// prefix followed by factorized-elements
int offset = ch - m_rangestart_;
int indexes[] = new int[m_variant_];
int factor;
// write elements according to the factors
// the factorized elements are determined by modulo arithmetic
for (int i = m_variant_ - 1; i > 0; i --)
{
factor = m_factor_[i] & 0x00FF;
indexes[i] = offset % factor;
offset /= factor;
}
// we don't need to calculate the last modulus because
// start <= code <= end guarantees here that code <= factors[0]
indexes[0] = offset;
// joining up the factorized strings
String s[] = getFactorString(indexes);
if (s != null && s.length > 0)
{
int size = s.length;
for (int i = 0; i < size; i ++)
str.append(s[i]);
}
break;
}
}
/**
* Gets the character for the argument algorithmic name
* @return the algorithmic char or -1 otherwise.
*/
protected int getAlgorithmChar(String name)
{
int prefixlen = m_prefix_.length();
if (name.length() < prefixlen ||
!m_prefix_.equals(name.substring(0, prefixlen))) {
return -1;
}
switch (m_type_)
{
case TYPE_0_ :
try
{
int result = Integer.parseInt(name.substring(prefixlen), 16);
// does it fit into the range?
if (m_rangestart_ <= result && result <= m_rangeend_) {
return result;
}
}
catch (NumberFormatException e)
{
}
break;
case TYPE_1_ :
// repetitative suffix name comparison done here
// offset is the character code - start
for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++)
{
int offset = ch - m_rangestart_;
int indexes[] = new int[m_variant_];
int factor;
// write elements according to the factors
// the factorized elements are determined by modulo arithmetic
for (int i = m_variant_ - 1; i > 0; i --)
{
factor = m_factor_[i] & 0x00FF;
indexes[i] = offset % factor;
offset /= factor;
}
// we don't need to calculate the last modulus because
// start <= code <= end guarantees here that code <= factors[0]
indexes[0] = offset;
// joining up the factorized strings
if (compareFactorString(indexes, name.substring(prefixlen))) {
return ch;
}
}
}
return -1;
}
// private methods ================================================
/**
* Gets the indexth string in each of the argument factor block
* @param index array with each index corresponding to each factor block
* @return array of indexth factor string in factor block
*/
private String[] getFactorString(int index[])
{
int size = m_factor_.length;
if (index == null || index.length != size) {
return null;
}
String result[] = new String[size];
StringBuffer str = new StringBuffer();
int count = 0;
int factor;
size --;
for (int i = 0; i <= size; i ++) {
factor = m_factor_[i];
count = UCharacterUtil.skipNullTermByteSubString(m_factorstring_,
count, index[i]);
count = UCharacterUtil.getNullTermByteSubString(str, m_factorstring_,
count);
if (i != size) {
count = UCharacterUtil.skipNullTermByteSubString(m_factorstring_,
count, factor - index[i] - 1);
}
result[i] = str.toString();
str.delete(0, str.length());
}
return result;
}
/**
* Compares the indexth string in each of the argument factor block with
* the argument string
* @param index array with each index corresponding to each factor block
* @param str string to compare with
* @return true if string matches
*/
private boolean compareFactorString(int index[], String str)
{
int size = m_factor_.length;
if (index == null || index.length != size)
return false;
int count = 0;
int strcount = 0;
int factor;
size --;
for (int i = 0; i <= size; i ++)
{
factor = m_factor_[i];
count = UCharacterUtil.skipNullTermByteSubString(m_factorstring_,
count, index[i]);
strcount = UCharacterUtil.compareNullTermByteSubString(str,
m_factorstring_, strcount, count);
if (strcount < 0) {
return false;
}
if (i != size) {
count = UCharacterUtil.skipNullTermByteSubString(m_factorstring_,
count, factor - index[i]);
}
}
if (strcount != str.length()) {
return false;
}
return true;
}
}
}