blob: b780d7e733964b6e26bc4bf46e229d329690e58e [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2009-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.VersionInfo;
class Normalizer2Impl {
public static final class Hangul {
/* Korean Hangul and Jamo constants */
public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
public static final int HANGUL_BASE=0xac00;
public static final int JAMO_L_COUNT=19;
public static final int JAMO_V_COUNT=21;
public static final int JAMO_T_COUNT=28;
public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
public static final boolean isHangul(int c) {
return HANGUL_BASE<=c && c<HANGUL_LIMIT;
}
public static final boolean isHangulWithoutJamoT(char c) {
c-=HANGUL_BASE;
return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
}
public static final boolean isJamoL(int c) {
return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
}
public static final boolean isJamoV(int c) {
return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
}
/**
* Decomposes c, which must be a Hangul syllable, into buffer
* and returns the length of the decomposition (2 or 3).
*/
public static final int decompose(int c, StringBuilder buffer) {
c-=HANGUL_BASE;
int c2=c%JAMO_T_COUNT;
c/=JAMO_T_COUNT;
buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
if(c2==0) {
return 2;
} else {
buffer.append((char)(JAMO_T_BASE+c2));
return 3;
}
}
}
public static final class ReorderingBuffer {
public ReorderingBuffer(Normalizer2Impl ni, StringBuilder dest) {
impl=ni;
str=dest;
}
public final void init(int destCapacity) {
str.ensureCapacity(destCapacity);
reorderStart=0;
if(str.length()==0) {
lastCC=0;
} else {
setIterator();
lastCC=previousCC();
// Set reorderStart after the last code point with cc<=1 if there is one.
if(lastCC>1) {
while(previousCC()>1) {}
}
reorderStart=codePointLimit;
}
}
public final boolean isEmpty() { return str.length()==0; }
public final int length() { return str.length(); }
public final int getLastCC() { return lastCC; }
public final void append(int c, int cc) {
if(lastCC<=cc || cc==0) {
str.appendCodePoint(c);
lastCC=cc;
if(cc<=1) {
reorderStart=str.length();
}
} else {
insert(c, cc);
}
}
// s must be in NFD, otherwise change the implementation.
public final void append(CharSequence s, int start, int length,
int leadCC, int trailCC) {
if(length==0) {
return;
}
if(lastCC<=leadCC || leadCC==0) {
if(trailCC<=1) {
reorderStart=str.length()+length;
} else if(leadCC<=1) {
reorderStart=str.length()+1; // Ok if not a code point boundary.
}
str.append(s, start, start+length);
lastCC=trailCC;
} else {
int limit=start+length;
int c=Character.codePointAt(s, start);
start+=Character.charCount(c);
insert(c, leadCC); // insert first code point
while(start<limit) {
c=Character.codePointAt(s, start);
start+=Character.charCount(c);
if(start<limit) {
// s must be in NFD, otherwise we need to use getCC().
leadCC=Normalizer2Impl.getCCFromYesOrMaybe(impl.getNorm16(c));
} else {
leadCC=trailCC;
}
append(c, leadCC);
}
}
}
public final void appendZeroCC(int c) {
str.appendCodePoint(c);
lastCC=0;
reorderStart=str.length();
}
public final void appendZeroCC(CharSequence s, int start, int length) {
if(length!=0) {
str.append(s, start, start+length);
lastCC=0;
reorderStart=str.length();
}
}
public final void removeZeroCCSuffix(int length) {
int oldLength=str.length();
str.delete(oldLength-length, oldLength);
lastCC=0;
reorderStart=str.length();
}
public final void setReorderingLimitAndLastCC(int newLimit, int newLastCC) {
str.delete(newLimit, str.length());
reorderStart=newLimit;
lastCC=newLastCC;
}
/*
* TODO: Revisit whether it makes sense to track reorderStart.
* It is set to after the last known character with cc<=1,
* which stops previousCC() before it reads that character and looks up its cc.
* previousCC() is normally only called from insert().
* In other words, reorderStart speeds up the insertion of a combining mark
* into a multi-combining mark sequence where it does not belong at the end.
* This might not be worth the trouble.
* On the other hand, it's not a huge amount of trouble.
*
* We probably need it for UNORM_SIMPLE_APPEND.
*/
// Inserts c somewhere before the last character.
// Requires 0<cc<lastCC which implies reorderStart<limit.
private final void insert(int c, int cc) {
for(setIterator(), skipPrevious(); previousCC()>cc;) {}
// insert c at codePointLimit, after the character with prevCC<=cc
if(c<=0xffff) {
str.insert(codePointLimit, (char)c);
if(cc<=1) {
reorderStart=codePointLimit+1;
}
} else {
str.insert(codePointLimit, Character.toChars(c));
if(cc<=1) {
reorderStart=codePointLimit+2;
}
}
}
private Normalizer2Impl impl;
private StringBuilder str;
private int reorderStart;
private int lastCC;
// private backward iterator
private void setIterator() { codePointStart=str.length(); }
private void skipPrevious() { // Requires 0<codePointStart.
codePointLimit=codePointStart;
codePointStart=str.offsetByCodePoints(codePointStart, -1);
}
private int previousCC() { // Returns 0 if there is no previous character.
codePointLimit=codePointStart;
if(reorderStart>=codePointStart) {
return 0;
}
int c=str.codePointBefore(codePointStart);
codePointStart-=Character.charCount(c);
if(c<Normalizer2Impl.MIN_CCC_LCCC_CP) {
return 0;
}
return Normalizer2Impl.getCCFromYesOrMaybe(impl.getNorm16(c));
}
private int codePointStart, codePointLimit;
}
public Normalizer2Impl() {}
private static final class Reader implements ICUBinary.Authenticate {
// @Override when we switch to Java 6
public boolean isDataVersionAcceptable(byte version[]) {
return version[0]==1;
}
public VersionInfo readHeader(InputStream data) throws IOException {
byte[] dataVersion=ICUBinary.readHeader(data, DATA_FORMAT, this);
return VersionInfo.getInstance(dataVersion[0], dataVersion[1],
dataVersion[2], dataVersion[3]);
}
private static final byte DATA_FORMAT[] = { 0x4e, 0x72, 0x6d, 0x32 }; // "Nrm2"
}
private static final Reader READER=new Reader();
public final void load(InputStream data) throws IOException {
BufferedInputStream bis=new BufferedInputStream(data);
dataVersion=READER.readHeader(bis);
DataInputStream ds=new DataInputStream(bis);
int indexesLength=ds.readInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
if(indexesLength<=IX_MIN_MAYBE_YES) {
throw new IOException("Normalizer2 data: not enough indexes");
}
int[] inIndexes=new int[indexesLength];
inIndexes[0]=indexesLength*4;
for(int i=1; i<indexesLength; ++i) {
inIndexes[i]=ds.readInt();
}
minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
minYesNo=inIndexes[IX_MIN_YES_NO];
minNoNo=inIndexes[IX_MIN_NO_NO];
limitNoNo=inIndexes[IX_LIMIT_NO_NO];
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
// Read the normTrie.
int offset=inIndexes[IX_NORM_TRIE_OFFSET];
int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
normTrie=Trie2_16.createFromSerialized(ds);
int trieLength=normTrie.getSerializedLength();
if(trieLength>(nextOffset-offset)) {
throw new IOException("Normalizer2 data: not enough bytes for normTrie");
}
ds.skipBytes((nextOffset-offset)-trieLength); // skip padding after trie bytes
// Read the composition and mapping data.
offset=nextOffset;
nextOffset=inIndexes[IX_RESERVED2_OFFSET];
int numChars=(nextOffset-offset)/2;
char[] chars;
if(numChars!=0) {
chars=new char[numChars];
for(int i=0; i<numChars; ++i) {
chars[i]=ds.readChar();
}
maybeYesCompositions=new String(chars);
extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
}
data.close();
}
public final void load(ClassLoader root, String name) throws IOException {
load(ICUData.getRequiredStream(root, name));
}
public final void addPropertyStarts(UnicodeSet sa) {
// TODO
}
// low-level properties ------------------------------------------------ ***
public final Trie2_16 getNormTrie() { return normTrie; }
public final Trie2_16 getFCDTrie() {
return fcdTrie; // TODO: build if necessary, with synchronization
}
public final int getNorm16(int c) { return normTrie.get(c); }
/*
UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
return UNORM_YES;
} else if(minMaybeYes<=norm16) {
return UNORM_MAYBE;
} else {
return UNORM_NO;
}
}
UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; }
UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }
public final int getCC(int norm16) {
if(norm16>=MIN_NORMAL_MAYBE_YES) {
return norm16&0xff;
}
if(norm16<minNoNo || limitNoNo<=norm16) {
return 0;
}
return getCCFromNoNo(norm16);
}
*/
public static final int getCCFromYesOrMaybe(int norm16) {
return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
}
/*
uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); }
uint16_t getFCD16FromBMP(UChar c) const { return UTRIE2_GET16(fcdTrie(), c); }
uint16_t getFCD16FromSingleLead(UChar c) const {
return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c);
}
uint16_t getFCD16FromSupplementary(UChar32 c) const {
return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c);
}
uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const {
return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2));
}
void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
UTrie2 *newFCDTrie) const;
*/
/**
* Get the decomposition for one code point.
* @param c code point
* @param buffer out-only buffer gets the decomposition appended
* @return true if c has a decomposition
*/
public final boolean getDecomposition(int c, StringBuilder buffer) {
return false; // TODO
}
public static final int MIN_CCC_LCCC_CP=0x300;
public static final int MIN_YES_YES_WITH_CC=0xff01;
public static final int JAMO_VT=0xff00;
public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
public static final int JAMO_L=1;
public static final int MAX_DELTA=0x40;
// Byte offsets from the start of the data, after the generic header.
public static final int IX_NORM_TRIE_OFFSET=0;
public static final int IX_EXTRA_DATA_OFFSET=1;
public static final int IX_RESERVED2_OFFSET=2;
public static final int IX_TOTAL_SIZE=7;
// Code point thresholds for quick check codes.
public static final int IX_MIN_DECOMP_NO_CP=8;
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
// Norm16 value thresholds for quick check combinations and types of extra data.
public static final int IX_MIN_YES_NO=10;
public static final int IX_MIN_NO_NO=11;
public static final int IX_LIMIT_NO_NO=12;
public static final int IX_MIN_MAYBE_YES=13;
public static final int IX_COUNT=16;
public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
public static final int MAPPING_PLUS_COMPOSITION_LIST=0x40;
public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
public static final int MAPPING_LENGTH_MASK=0x1f;
public static final int COMP_1_LAST_TUPLE=0x8000;
public static final int COMP_1_TRIPLE=1;
public static final int COMP_1_TRAIL_LIMIT=0x3400;
public static final int COMP_1_TRAIL_MASK=0x7ffe;
public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit
public static final int COMP_2_TRAIL_SHIFT=6;
public static final int COMP_2_TRAIL_MASK=0xffc0;
// higher-level functionality ------------------------------------------ ***
/*
const UChar *decompose(const UChar *src, const UChar *limit,
ReorderingBuffer *buffer) const;
void decomposeAndAppend(const UChar *src, const UChar *limit,
UBool doDecompose,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
UBool compose(const UChar *src, const UChar *limit,
UBool onlyContiguous,
UBool doCompose,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
const UChar *composeQuickCheck(const UChar *src, const UChar *limit,
UBool onlyContiguous,
UNormalizationCheckResult *pQCResult) const;
void composeAndAppend(const UChar *src, const UChar *limit,
UBool doCompose,
UBool onlyContiguous,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
const UChar *makeFCD(const UChar *src, const UChar *limit,
ReorderingBuffer *buffer) const;
void makeFCDAndAppend(const UChar *src, const UChar *limit,
UBool doMakeFCD,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
UBool hasDecompBoundary(UChar32 c, UBool before) const;
UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
UBool hasCompBoundaryBefore(UChar32 c) const {
return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
}
UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;
UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
UBool hasFCDBoundaryAfter(UChar32 c) const {
uint16_t fcd16=getFCD16(c);
return fcd16<=1 || (fcd16&0xff)==0;
}
UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
*/
/* private ----
static UBool U_CALLCONV
isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
static UBool isInert(uint16_t norm16) { return norm16==0; }
// static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
// UBool isCompYes(uint16_t norm16) const {
// return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
// }
// UBool isCompYesOrMaybe(uint16_t norm16) const {
// return norm16<minNoNo || minMaybeYes<=norm16;
// }
UBool hasZeroCCFromDecompYes(uint16_t norm16) {
return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
}
UBool isDecompYesAndZeroCC(uint16_t norm16) const {
return norm16<minYesNo ||
norm16==JAMO_VT ||
(minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
}
*/
/**
* A little faster and simpler than isDecompYesAndZeroCC() but does not include
* the MaybeYes which combine-forward and have ccc=0.
* (Standard Unicode 5.2 normalization does not have such characters.)
*/
/*
UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
}
UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; }
// For use with isCompYes().
// Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
// static uint8_t getCCFromYes(uint16_t norm16) {
// return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
// }
uint8_t getCCFromNoNo(uint16_t norm16) const {
const uint16_t *mapping=getMapping(norm16);
if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
return (uint8_t)mapping[1];
} else {
return 0;
}
}
// requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;
// Requires algorithmic-NoNo.
UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
return c+norm16-(minMaybeYes-MAX_DELTA-1);
}
// Requires minYesNo<norm16<limitNoNo.
const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
const uint16_t *getCompositionsListForDecompYesAndZeroCC(uint16_t norm16) const {
if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
return NULL;
} else if(norm16<minMaybeYes) {
return extraData+norm16; // for yesYes; if Jamo L: harmless empty list
} else {
return maybeYesCompositions+norm16-minMaybeYes;
}
}
const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list
return list+ // mapping pointer
1+ // +1 to skip the first unit with the mapping lenth
(*list&MAPPING_LENGTH_MASK)+ // + mapping length
((*list>>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD
}
const UChar *copyLowPrefixFromNulTerminated(const UChar *src,
UChar32 minNeedDataCP,
ReorderingBuffer *buffer,
UErrorCode &errorCode) const;
UBool decomposeShort(const UChar *src, const UChar *limit,
ReorderingBuffer &buffer) const;
UBool decompose(UChar32 c, uint16_t norm16,
ReorderingBuffer &buffer) const;
static int32_t combine(const uint16_t *list, UChar32 trail);
void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
UBool onlyContiguous) const;
UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; }
const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
*/
VersionInfo dataVersion;
// Code point thresholds for quick check codes.
int minDecompNoCP;
int minCompNoMaybeCP;
// Norm16 value thresholds for quick check combinations and types of extra data.
int minYesNo;
int minNoNo;
int limitNoNo;
int minMaybeYes;
Trie2_16 normTrie;
String maybeYesCompositions;
String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
Trie2_16 fcdTrie;
}
// TODO: Copy parts of normalizer2impl.h starting with Normalizer2Factory??