blob: 2ff5a257639d33b581b5ae3cf868de62d347d8e5 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.CharacterIterator;
import com.ibm.icu.impl.ICUBinary;
/*
* This is a class used to load in the compact trie dictionary file
* used for dictionary based break iteration.
*/
class BreakCTDictionary {
private CompactTrieHeader fData;
static class CompactTrieHeader {
int size; // Size of data in bytes
int magic; // Magic number (including versions)
int nodeCount; // Number of entries in offsets[]
int root; // Node number of the root node
int []offset; // Offsets to nodes from start of data
CompactTrieHeader() {
size = 0;
magic = 0;
nodeCount = 0;
root = 0;
offset = null;
}
}
static final class CompactTrieNodeFlags {
static final int kVerticalNode = 0x1000; // This is a vertical node
static final int kParentEndsWord = 0x2000; // The node whose equal link
// points to this ends a
// word
static final int kReservedFlag1 = 0x4000;
static final int kReservedFlag2 = 0x8000;
static final int kCountMask = 0x0FFF; // The count portion of
// flagscount
static final int kFlagMask = 0xF000; // The flags portion of
// flagscount
}
// The two node types are distinguished by the kVerticalNode flag.
static class CompactTrieHorizontalNode {
char ch; // UChar
int equal; // Equal link node index
CompactTrieHorizontalNode(char newCh, int newEqual) {
ch = newCh;
equal = newEqual;
}
}
static class CompactTrieVerticalNode {
int equal; // Equal link node index
char chars[]; // Code units
CompactTrieVerticalNode() {
equal = 0;
chars = null;
}
}
private CompactTrieNodes getCompactTrieNode(int node) {
return nodes[node];
}
// private class to hold both node information
static class CompactTrieNodes {
short flagscount; // Count of sub-entries, plus flags
CompactTrieHorizontalNode[] hnode;
CompactTrieVerticalNode vnode;
CompactTrieNodes() {
flagscount = 0;
hnode = null;
vnode = null;
}
}
private CompactTrieNodes[] nodes;
// Constructor
public BreakCTDictionary(InputStream is) throws IOException {
ICUBinary.readHeader(is, DATA_FORMAT_ID, null);
DataInputStream in = new DataInputStream(is);
// Get header information
fData = new CompactTrieHeader();
fData.size = in.readInt();
fData.magic = in.readInt();
fData.nodeCount = in.readShort();
fData.root = in.readShort();
loadBreakCTDictionary(in);
}
// Loads the compact trie dictionary file into the CompactTrieNodes
private void loadBreakCTDictionary(DataInputStream in) throws IOException {
// skip over offset information
for (int i = 0; i < fData.nodeCount; i++) {
in.readInt();
}
// Create compact trie dictionary
nodes = new CompactTrieNodes[fData.nodeCount];
nodes[0] = new CompactTrieNodes();
// Load in compact trie dictionary
for (int j = 1; j < fData.nodeCount; j++) {
nodes[j] = new CompactTrieNodes();
nodes[j].flagscount = in.readShort();
int count = nodes[j].flagscount & CompactTrieNodeFlags.kCountMask;
if (count != 0) {
boolean isVerticalNode = (nodes[j].flagscount & CompactTrieNodeFlags.kVerticalNode) != 0;
// Vertical node
if (isVerticalNode) {
nodes[j].vnode = new CompactTrieVerticalNode();
nodes[j].vnode.equal = in.readShort();
nodes[j].vnode.chars = new char[count];
for (int l = 0; l < count; l++) {
nodes[j].vnode.chars[l] = in.readChar();
}
} else { // Horizontal node
nodes[j].hnode = new CompactTrieHorizontalNode[count];
for (int n = 0; n < count; n++) {
nodes[j].hnode[n] = new CompactTrieHorizontalNode(in
.readChar(), in.readShort());
}
}
}
}
}
/**
* Find dictionary words that match the text.
*
* @param text A CharacterIterator representing the text. The iterator is
* left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
public int matches(CharacterIterator text, int maxLength, int lengths[],
int count[], int limit) {
// Current implementation works in UTF-16 space
CompactTrieNodes node = getCompactTrieNode(fData.root);
int mycount = 0;
char uc = text.current();
int i = 0;
boolean exitFlag = false;
while (node != null) {
// Check if the node we just exited ends a word
if (limit > 0
&& (node.flagscount & CompactTrieNodeFlags.kParentEndsWord) != 0) {
lengths[mycount++] = i;
--limit;
}
// Check that we haven't exceeded the maximum number of input
// characters.
// We have to do that here rather than in the while condition so
// that
// we can check for ending of a word above.
if (i >= maxLength) {
break;
}
int nodeCount = (node.flagscount & CompactTrieNodeFlags.kCountMask);
if (nodeCount == 0) {
// Special terminal node; return now
break;
}
if ((node.flagscount & CompactTrieNodeFlags.kVerticalNode) != 0) {
// Vertical node; check all the characters in it
CompactTrieVerticalNode vnode = node.vnode;
for (int j = 0; j < nodeCount && i < maxLength; j++) {
if (uc != vnode.chars[j]) {
// We hit a non equal character return;
exitFlag = true;
break;
}
text.next();
uc = text.current();
i++;
}
if (exitFlag) {
break;
}
// To get here, we must have come through the whole list successfully;
// go on to the next node. Note that a word cannot end in the middle
// of a vertical node.
node = getCompactTrieNode(vnode.equal);
} else {
// Horizontal node; do binary search
CompactTrieHorizontalNode[] hnode = node.hnode;
int low = 0;
int high = nodeCount - 1;
int middle;
node = null; // If we don't find a match, we'll fall out of the loop
while (high >= low) {
middle = (high + low) / 2;
if (uc == hnode[middle].ch) {
// We hit a match; get the next node and next character
node = getCompactTrieNode(hnode[middle].equal);
text.next();
uc = text.current();
i++;
break;
} else if (uc < hnode[middle].ch) {
high = middle - 1;
} else {
low = middle + 1;
}
}
}
}
count[0] = mycount;
return i;
}
// Use for reading the header portion of the file
private static final byte DATA_FORMAT_ID[] = { (byte) 0x54, (byte) 0x72,
(byte) 0x44, (byte) 0x63 };
}