| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-2000, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/NormalizerImpl.java,v $ |
| * $Date: 2002/03/28 01:50:59 $ |
| * $Revision: 1.4 $ |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.icu.impl; |
| import java.io.*; |
| //import com.ibm.icu.text.NewNormalizer; |
| import com.ibm.icu.text.UTF16; |
| /** |
| * @version 1.0 |
| * @author Ram Viswanadha |
| */ |
| public final class NormalizerImpl { |
| /* Static block for the class to initialize its own self */ |
| static NormalizerImpl IMPL=null; |
| |
| static |
| { |
| try |
| { |
| IMPL = new NormalizerImpl(); |
| } |
| catch (Exception e) |
| { |
| throw new RuntimeException(e.getMessage()); |
| } |
| } |
| |
| static final int UNSIGNED_BYTE_MASK =0xFF; |
| /* |
| * This new implementation of the normalization code loads its data from |
| * unorm.dat, which is generated with the gennorm tool. |
| * The format of that file is described at the end of this file. |
| */ |
| private static final String DATA_FILE_NAME_ = "data/unorm.dat"; |
| |
| /* norm32 value constants */ |
| |
| /* quick check flags 0..3 set mean "no" for their forms */ |
| static final int QC_NFC=0x11; /* no|maybe */ |
| static final int QC_NFKC=0x22; /* no|maybe */ |
| static final int QC_NFD=4; /* no */ |
| static final int QC_NFKD=8; /* no */ |
| |
| static final int QC_ANY_NO=0xf; |
| |
| /* quick check flags 4..5 mean "maybe" for their forms; test flags>=QC_MAYBE */ |
| static final int QC_MAYBE=0x10; |
| static final int QC_ANY_MAYBE=0x30; |
| |
| static final int QC_MASK=0x3f; |
| |
| static final int COMBINES_FWD=0x40; |
| static final int COMBINES_BACK=0x80; |
| static final int COMBINES_ANY=0xc0; |
| |
| static final int CC_SHIFT=8; /* UnicodeData.txt combining class in bits 15..8 */ |
| static final int CC_MASK=0xff00; |
| |
| static final int EXTRA_SHIFT=16; /* 16 bits for the index to UChars and other extra data */ |
| static final int EXTRA_INDEX_TOP=0xfc00; /* start of surrogate specials after shift */ |
| |
| static final int EXTRA_SURROGATE_MASK=0x3ff; |
| static final int EXTRA_SURROGATE_TOP=0x3f0; /* hangul etc. */ |
| |
| static final int EXTRA_HANGUL=EXTRA_SURROGATE_TOP; |
| static final int EXTRA_JAMO_L=EXTRA_SURROGATE_TOP+1; /* ### not used */ |
| static final int EXTRA_JAMO_V=EXTRA_SURROGATE_TOP+2; |
| static final int EXTRA_JAMO_T=EXTRA_SURROGATE_TOP+3; |
| |
| /* norm32 value constants using >16 bits */ |
| static final int UNSIGNED_INT_MASK = 0x7fffffff; |
| static final int MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK; |
| static final int SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK; |
| static final int MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK; |
| static final int MIN_JAMO_V = 0xfff20000 & UNSIGNED_INT_MASK; |
| static final int JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK; |
| |
| |
| /* indexes[] value names */ |
| |
| static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */ |
| static final int INDEX_CHAR_COUNT = 1; /* number of chars in extra data */ |
| |
| static final int INDEX_COMBINE_DATA_COUNT = 2; /* number of uint16_t words for combining data */ |
| static final int INDEX_COMBINE_FWD_COUNT = 3; /* number of code points that combine forward */ |
| static final int INDEX_COMBINE_BOTH_COUNT = 4; /* number of code points that combine forward and backward */ |
| static final int INDEX_COMBINE_BACK_COUNT = 5; /* number of code points that combine backward */ |
| |
| static final int INDEX_MIN_NFC_NO_MAYBE = 6; /* first code point with quick check NFC NO/MAYBE */ |
| static final int INDEX_MIN_NFKC_NO_MAYBE = 7; /* first code point with quick check NFKC NO/MAYBE */ |
| static final int INDEX_MIN_NFD_NO_MAYBE = 8; /* first code point with quick check NFD NO/MAYBE */ |
| static final int INDEX_MIN_NFKD_NO_MAYBE = 9; /* first code point with quick check NFKD NO/MAYBE */ |
| |
| static final int INDEX_FCD_TRIE_SIZE = 10; /* number of bytes in FCD trie */ |
| static final int INDEX_AUX_TRIE_SIZE = 11; /* number of bytes in the auxiliary trie */ |
| static final int INDEX_CANON_SET_COUNT = 12; /* number of uint16_t in the array of serialized USet */ |
| |
| static final int INDEX_TOP = 32; /* changing this requires a new formatVersion */ |
| |
| |
| /* AUX constants */ |
| /* value constants for auxTrie */ |
| static final int AUX_UNSAFE_SHIFT = 11; |
| static final int AUX_COMP_EX_SHIFT = 10; |
| |
| static final int AUX_MAX_FNC = ((int)1<<AUX_COMP_EX_SHIFT); |
| static final int AUX_UNSAFE_MASK = (1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK; |
| static final int AUX_FNC_MASK = (AUX_MAX_FNC-1) & UNSIGNED_INT_MASK; |
| static final int AUX_COMP_EX_MASK = (1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK; |
| |
| /* canonStartSets[0..31] contains indexes for what is in the array */ |
| static final int SET_INDEX_CANON_SETS_LENGTH = 0; /* number of uint16_t in canonical starter sets */ |
| static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1; /* number of uint16_t in the BMP search table (contains pairs) */ |
| static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2; /* number of uint16_t in the supplementary search table (contains triplets) */ |
| static final int SET_INDEX_TOP = 32;/* changing this requires a new formatVersion */ |
| |
| static final int CANON_SET_INDICIES_INDEX = 0; |
| static final int CANON_SET_START_SETS_INDEX = 1; |
| static final int CANON_SET_BMP_TABLE_INDEX = 2; |
| static final int CANON_SET_SUPP_TABLE_INDEX = 3; |
| |
| static final int CANON_SET_MAX_CANON_SETS = 0x0004; /* 14 bit indexes to canonical USerializedSets */ |
| /* single-code point BMP sets are encoded directly in the search table except if result=0x4000..0x7fff */ |
| static final int CANON_SET_BMP_MASK = 0xc000; |
| static final int CANON_SET_BMP_IS_INDEX = 0x4000; |
| |
| /*******************************/ |
| |
| /* Wrappers for Trie implementations */ |
| static final class NormTrieImpl implements Trie.DataManipulate{ |
| static IntTrie normTrie= null; |
| /** |
| * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's |
| * data the index array offset of the indexes for that lead surrogate. |
| * @param property data value for a surrogate from the trie, including the |
| * folding offset |
| * @return data offset or 0 if there is no data for the lead surrogate |
| */ |
| public int getFoldingOffset(int value){ |
| |
| return 0x10000>>5+((value>>(EXTRA_SHIFT-5))&(0x3ff<<5)); |
| |
| } |
| |
| } |
| static final class FCDTrieImpl implements Trie.DataManipulate{ |
| static CharTrie fcdTrie=null; |
| /** |
| * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's |
| * data the index array offset of the indexes for that lead surrogate. |
| * @param property data value for a surrogate from the trie, including the |
| * folding offset |
| * @return data offset or 0 if there is no data for the lead surrogate |
| */ |
| |
| public int getFoldingOffset(int value){ |
| return 0; |
| } |
| } |
| |
| static final class AuxTrieImpl implements Trie.DataManipulate{ |
| static CharTrie auxTrie = null; |
| /** |
| * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's |
| * data the index array offset of the indexes for that lead surrogate. |
| * @param property data value for a surrogate from the trie, including the |
| * folding offset |
| * @return data offset or 0 if there is no data for the lead surrogate |
| */ |
| public int getFoldingOffset(int value){ |
| return (value&AUX_FNC_MASK)<<5; |
| } |
| } |
| |
| /****************************************************/ |
| |
| |
| static FCDTrieImpl fcdTrieImpl; |
| static NormTrieImpl normTrieImpl; |
| static AuxTrieImpl auxTrieImpl; |
| static int[] indexes; |
| static char[] combiningTable; |
| static char[] extraData; |
| static Object[] canonStartSets; |
| |
| static boolean isDataLoaded; |
| static boolean isFormatVersion_2_1; |
| /** |
| * Default buffer size of datafile |
| */ |
| private static final int DATA_BUFFER_SIZE_ = 25000; |
| |
| /* FCD check: everything below this code point is known to have a 0 lead combining class */ |
| public static final int MIN_WITH_LEAD_CC=0x300; |
| |
| |
| /** |
| * Bit 7 of the length byte for a decomposition string in extra data is |
| * a flag indicating whether the decomposition string is |
| * preceded by a 16-bit word with the leading and trailing cc |
| * of the decomposition (like for A-umlaut); |
| * if not, then both cc's are zero (like for compatibility ideographs). |
| */ |
| static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80; |
| /** |
| * Bits 6..0 of the length byte contain the actual length. |
| */ |
| static final int DECOMP_LENGTH_MASK=0x7f; |
| |
| /* -------------------------------------------------------------------------- */ |
| |
| /* Korean Hangul and Jamo constants */ |
| |
| private static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ |
| private static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ |
| private static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ |
| |
| private static final int HANGUL_BASE=0xac00; |
| |
| private static final int JAMO_L_COUNT=19; |
| private static final int JAMO_V_COUNT=21; |
| private static final int JAMO_T_COUNT=28; |
| private static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; |
| |
| private static boolean isHangulWithoutJamoT(char c) { |
| c-=HANGUL_BASE; |
| return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; |
| } |
| |
| /* norm32 helpers */ |
| |
| /* is this a norm32 with a regular index? */ |
| private static boolean isNorm32Regular(int norm32) { |
| return norm32<MIN_SPECIAL; |
| } |
| |
| /* is this a norm32 with a special index for a lead surrogate? */ |
| private static boolean isNorm32LeadSurrogate(int norm32) { |
| return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP; |
| } |
| |
| /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */ |
| private static boolean isNorm32HangulOrJamo(int norm32) { |
| return norm32>=MIN_HANGUL; |
| } |
| |
| /* |
| * Given isNorm32HangulOrJamo(), |
| * is this a Hangul syllable or a Jamo? |
| */ |
| private static boolean isHangulJamoNorm32HangulOrJamoL(int norm32) { |
| return norm32<MIN_JAMO_V; |
| } |
| |
| /* |
| * Given norm32 for Jamo V or T, |
| * is this a Jamo V? |
| */ |
| private static boolean isJamoVTNorm32JamoV(int norm32) { |
| return norm32<JAMO_V_TOP; |
| } |
| |
| static int getExtraDataIndex(int norm32) { |
| return (norm32>>EXTRA_SHIFT); |
| } |
| |
| |
| // protected constructor --------------------------------------------- |
| |
| /** |
| * Constructor |
| * @exception thrown when data reading fails or data corrupted |
| */ |
| private NormalizerImpl() throws IOException{ |
| //data should be loaded only once |
| if(!isDataLoaded){ |
| indexes = null; |
| combiningTable=null; |
| extraData=null; |
| fcdTrieImpl = new FCDTrieImpl(); |
| normTrieImpl = new NormTrieImpl(); |
| auxTrieImpl = new AuxTrieImpl(); |
| // jar access |
| InputStream i = getClass().getResourceAsStream(DATA_FILE_NAME_); |
| BufferedInputStream b = new BufferedInputStream(i, |
| DATA_BUFFER_SIZE_); |
| NormalizerDataReader reader = new NormalizerDataReader(b); |
| reader.read(this); |
| b.close(); |
| i.close(); |
| } |
| } |
| |
| public static boolean checkFCD(char[] src) { |
| |
| char fcd16,c; |
| int prevCC=0, cc; |
| int i =0, length = src.length; |
| |
| for(;;) { |
| for(;;) { |
| if(i==length) { |
| return true; |
| } else if((c=src[i++])<MIN_WITH_LEAD_CC) { |
| prevCC=(int)-c; |
| } else if((fcd16=fcdTrieImpl.fcdTrie.getBMPValue(c))==0) { |
| prevCC=0; |
| } else { |
| break; |
| } |
| } |
| |
| /* check one above-minimum, relevant code unit */ |
| if(UTF16.isLeadSurrogate(c)) { |
| /* c is a lead surrogate, get the real fcd16 */ |
| if(i!=length && UTF16.isTrailSurrogate(src[i])) { |
| ++i; |
| fcd16=fcdTrieImpl.fcdTrie.getSurrogateValue(fcd16, src[i]); |
| } else { |
| fcd16=0; |
| } |
| } |
| |
| /* |
| * prevCC has values from the following ranges: |
| * 0..0xff - the previous trail combining class |
| * <0 - the negative value of the previous code unit; |
| * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16() |
| * was deferred so that average text is checked faster |
| */ |
| |
| /* check the combining order */ |
| cc=(int)(fcd16>>8); |
| if(cc!=0) { |
| if(prevCC<0) { |
| /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */ |
| prevCC=(int)(fcdTrieImpl.fcdTrie.getBMPValue((char)-prevCC)&0xff); |
| } |
| |
| if(cc<prevCC) { |
| return false; |
| } |
| } |
| prevCC=(int)(fcd16&0xff); |
| } |
| } |
| /* |
| public static NewNormalizer.QuickCheckResult quickCheck(char[] src,NewNormalizer.Mode mode) { |
| |
| int norm32, ccOrQCMask, qcMask; |
| char c, c2, minNoMaybe; |
| char cc, prevCC; |
| NewNormalizer.QuickCheckResult result; |
| |
| |
| |
| if(!isDataLoaded) { |
| return NewNormalizer.MAYBE; |
| } |
| |
| // check for a valid mode and set the quick check minimum and mask |
| |
| if(mode.equals(NewNormalizer.NFC)){ |
| minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE]; |
| qcMask=QC_NFC; |
| }else if(mode.equals(NewNormalizer.NFKC)){ |
| minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE]; |
| qcMask=QC_NFKC; |
| }else if(mode.equals(NewNormalizer.NFD)){ |
| minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE]; |
| qcMask=QC_NFD; |
| }else if(mode.equals(NewNormalizer.NFKD)){ |
| minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE]; |
| qcMask=QC_NFKD; |
| }else if(mode.equals(NewNormalizer.FCD)){ |
| return (checkFCD(src)) ? NewNormalizer.YES : NewNormalizer.NO; |
| }else{ |
| return NewNormalizer.MAYBE; |
| } |
| |
| // initialize |
| ccOrQCMask=CC_MASK|qcMask; |
| result=NewNormalizer.YES; |
| prevCC=0; |
| int i=0; |
| |
| for(;;) { |
| for(;;) { |
| if(i==src.length) { |
| return result; |
| } else if((c=src[i++])>=minNoMaybe && ((norm32=normTrieImpl.normTrie.getBMPValue(c))&ccOrQCMask)!=0) { |
| break; |
| } |
| prevCC=0; |
| } |
| |
| |
| //* check one above-minimum, relevant code unit |
| if(isNorm32LeadSurrogate(norm32)) { |
| //* c is a lead surrogate, get the real norm32 |
| if(i!=src.length && UTF16.isTrailSurrogate(c2=src[i])) { |
| ++i; |
| norm32=normTrieImpl.normTrie.getRawOffset(norm32, c2); |
| } else { |
| norm32=0; |
| } |
| } |
| |
| //* check the combining order |
| cc=(char)((norm32>>CC_SHIFT)&0xFF); |
| if(cc!=0 && cc<prevCC) { |
| return NewNormalizer.NO; |
| } |
| prevCC=cc; |
| |
| //* check for "no" or "maybe" quick check flags |
| norm32&=qcMask; |
| if((norm32& QC_ANY_NO)>=1) { |
| return NewNormalizer.NO; |
| } else if(norm32!=0) { |
| result=NewNormalizer.MAYBE; |
| } |
| } |
| } */ |
| |
| public static int getCombiningClass(int c) { |
| int norm32; |
| if(c<=0xffff) { |
| norm32=normTrieImpl.normTrie.getBMPValue((char)c); |
| } else { |
| norm32=normTrieImpl.normTrie.getBMPValue(UTF16.getLeadSurrogate(c)); |
| if((norm32&CC_MASK)!=0) { |
| norm32=normTrieImpl.normTrie.getRawOffset(norm32, UTF16.getTrailSurrogate(c)); |
| } |
| } |
| return (char)((norm32>>CC_SHIFT)&0xFF); |
| } |
| |
| public static boolean isFullCompositionExclusion(int c) { |
| if(isFormatVersion_2_1) { |
| int aux =auxTrieImpl.auxTrie.getCodePointValue(c); |
| return (boolean)((aux & AUX_COMP_EX_MASK)!=0); |
| } else { |
| return false; |
| } |
| } |
| |
| public static boolean isCanonSafeStart(int c) { |
| if(isFormatVersion_2_1) { |
| int aux = auxTrieImpl.auxTrie.getCodePointValue(c); |
| return (boolean)((aux & AUX_UNSAFE_MASK)==0); |
| } else { |
| return false; |
| } |
| } |
| |
| public static boolean getCanonStartSet(int c, USerializedSet fillSet) { |
| |
| if(fillSet!=null && canonStartSets!=null) { |
| /* |
| * binary search for c |
| * |
| * There are two search tables, |
| * one for BMP code points and one for supplementary ones. |
| * See unormimp.h for details. |
| */ |
| char[] table; |
| int i, start, limit; |
| |
| if(c<=0xffff) { |
| table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX]; |
| start=0; |
| limit=table.length; |
| |
| /* each entry is a pair { c, result } */ |
| while(start<limit) { |
| i=(char)((start+limit)/2); |
| if(c<table[i]) { |
| limit=i; |
| } else { |
| start=i; |
| } |
| } |
| |
| /* found? */ |
| if(c==table[start]) { |
| i=table[start+1]; |
| if((i&CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) { |
| /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */ |
| i&=(CANON_SET_MAX_CANON_SETS-1); |
| return fillSet.getSet(table,i); |
| } else { |
| /* other result values are BMP code points for single-code point sets */ |
| fillSet.setSerializedToOne(i); |
| return true; |
| } |
| } |
| } else { |
| char high, low, h; |
| |
| table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX]; |
| start=0; |
| limit=table.length; |
| |
| high=(char)(c>>16); |
| low=(char)c; |
| |
| /* each entry is a triplet { high(c), low(c), result } */ |
| while(start<limit-3) { |
| i=(char)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */ |
| h=(char)(table[i]&0x1f); /* high word */ |
| if(high<h || (high==h && low<table[i+1])) { |
| limit=i; |
| } else { |
| start=i; |
| } |
| } |
| |
| /* found? */ |
| h=table[start]; |
| if(high==(h&0x1f) && low==table[start+1]) { |
| i=table[start+2]; |
| if((h&0x8000)==0) { |
| /* the result is an index to a USerializedSet */ |
| return fillSet.getSet(table,i); |
| } else { |
| /* |
| * single-code point set {x} in |
| * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx } |
| */ |
| i|=((int)h&0x1f00)<<8; /* add high bits from high(c) */ |
| fillSet.setSerializedToOne((int)i); |
| return true; |
| } |
| } |
| } |
| } |
| |
| return false; /* not found */ |
| } |
| |
| /** |
| * Internal API, used by collation code. |
| * Get access to the internal FCD trie table to be able to perform |
| * incremental, per-code unit, FCD checks in collation. |
| * One pointer is sufficient because the trie index values are offset |
| * by the index size, so that the same pointer is used to access the trie data. |
| * @internal |
| */ |
| public CharTrie getFCDTrie(){ |
| return fcdTrieImpl.fcdTrie; |
| } |
| |
| } |