/**
 *******************************************************************************
 * Copyright (C) 2004-2009, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.impl;

| /** |
| * For generation of Implicit CEs |
| * @author Mark Davis |
| * |
| * Cleaned up so that changes can be made more easily. |
| * Old values: |
| # First Implicit: E26A792D |
| # Last Implicit: E3DC70C0 |
| # First CJK: E0030300 |
| # Last CJK: E0A9DD00 |
| # First CJK_A: E0A9DF00 |
| # Last CJK_A: E0DE3100 |
| @internal |
| */ |
| public class ImplicitCEGenerator { |
| |
| /** |
| * constants |
| */ |
| static final boolean DEBUG = false; |
| |
| static final long topByte = 0xFF000000L; |
| static final long bottomByte = 0xFFL; |
| static final long fourBytes = 0xFFFFFFFFL; |
| |
| static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2 |
| |
| public static final int CJK_BASE = 0x4E00; |
| public static final int CJK_LIMIT = 0x9FFF+1; |
| public static final int CJK_COMPAT_USED_BASE = 0xFA0E; |
| public static final int CJK_COMPAT_USED_LIMIT = 0xFA2F+1; |
| public static final int CJK_A_BASE = 0x3400; |
| public static final int CJK_A_LIMIT = 0x4DBF+1; |
| public static final int CJK_B_BASE = 0x20000; |
| public static final int CJK_B_LIMIT = 0x2A6DF+1; |
| |
| // private void throwError(String title, int cp) { |
| // throw new IllegalArgumentException(title + "\t" + Utility.hex(cp, 6) + "\t" + |
| // Utility.hex(getImplicitFromRaw(cp) & fourBytes)); |
| // } |
| // |
| // private void throwError(String title, long ce) { |
| // throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes)); |
| // } |
| // |
| // private void show(int i) { |
| // if (i >= 0 && i <= MAX_INPUT) { |
| // System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes)); |
| // } |
| // } |
| |
| /** |
| * Precomputed by constructor |
| */ |
| int final3Multiplier; |
| int final4Multiplier; |
| int final3Count; |
| int final4Count; |
| int medialCount; |
| int min3Primary; |
| int min4Primary; |
| int max4Primary; |
| int minTrail; |
| int maxTrail; |
| int max3Trail; |
| int max4Trail; |
| int min4Boundary; |
| |
| public int getGap4() { |
| return final4Multiplier - 1; |
| } |
| |
| public int getGap3() { |
| return final3Multiplier - 1; |
| } |
| |
| // old comment |
| // we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values |
| // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case) |
| // we shift so that HAN all has the same first primary, for compression. |
| // for the 4 byte case, we make the gap as large as we can fit. |
| |
| /** |
| * Supply parameters for generating implicit CEs |
| */ |
| public ImplicitCEGenerator(int minPrimary, int maxPrimary) { |
| // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. |
| this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1); |
| } |
| |
| /** |
| * Set up to generate implicits. |
| * @param minPrimary The minimum primary value. |
| * @param maxPrimary The maximum primary value. |
| * @param minTrail final byte |
| * @param maxTrail final byte |
| * @param gap3 the gap we leave for tailoring for 3-byte forms |
| * @param primaries3count number of 3-byte primarys we can use (normally 1) |
| */ |
| public ImplicitCEGenerator(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) { |
| // some simple parameter checks |
| if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) { |
| throw new IllegalArgumentException("bad lead bytes"); |
| } |
| if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) { |
| throw new IllegalArgumentException("bad trail bytes"); |
| } |
| if (primaries3count < 1) { |
| throw new IllegalArgumentException("bad three-byte primaries"); |
| } |
| |
| this.minTrail = minTrail; |
| this.maxTrail = maxTrail; |
| |
| min3Primary = minPrimary; |
| max4Primary = maxPrimary; |
| // compute constants for use later. |
| // number of values we can use in trailing bytes |
| // leave room for empty values between AND above, e.g. if gap = 2 |
| // range 3..7 => +3 -4 -5 -6 -7: so 1 value |
| // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values |
| // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values |
| final3Multiplier = gap3 + 1; |
| final3Count = (maxTrail - minTrail + 1) / final3Multiplier; |
| max3Trail = minTrail + (final3Count - 1) * final3Multiplier; |
| |
| // medials can use full range |
| medialCount = (maxTrail - minTrail + 1); |
| // find out how many values fit in each form |
| int threeByteCount = medialCount * final3Count; |
| // now determine where the 3/4 boundary is. |
| // we use 3 bytes below the boundary, and 4 above |
| int primariesAvailable = maxPrimary - minPrimary + 1; |
| int primaries4count = primariesAvailable - primaries3count; |
| |
| int min3ByteCoverage = primaries3count * threeByteCount; |
| min4Primary = minPrimary + primaries3count; |
| min4Boundary = min3ByteCoverage; |
| // Now expand out the multiplier for the 4 bytes, and redo. |
| |
| int totalNeeded = MAX_INPUT - min4Boundary; |
| int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); |
| if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte); |
| |
| int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); |
| if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte); |
| |
| int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; |
| if (DEBUG) System.out.println("expandedGap: " + gap4); |
| if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s"); |
| |
| final4Multiplier = gap4 + 1; |
| final4Count = neededPerFinalByte; |
| max4Trail = minTrail + (final4Count - 1) * final4Multiplier; |
| |
| if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) { |
| throw new IllegalArgumentException("internal error"); |
| } |
| if (DEBUG) { |
| System.out.println("final4Count: " + final4Count); |
| for (int counter = 0; counter < final4Count; ++counter) { |
| int value = minTrail + (1 + counter)*final4Multiplier; |
| System.out.println(counter + "\t" + value + "\t" + Utility.hex(value)); |
| } |
| } |
| } |
| |
| static public int divideAndRoundUp(int a, int b) { |
| return 1 + (a-1)/b; |
| } |
| |
| /** |
| * Converts implicit CE into raw integer |
| * @param implicit The implicit value passed. |
| * @return -1 if illegal format |
| */ |
| public int getRawFromImplicit(int implicit) { |
| int result; |
| int b3 = implicit & 0xFF; |
| implicit >>= 8; |
| int b2 = implicit & 0xFF; |
| implicit >>= 8; |
| int b1 = implicit & 0xFF; |
| implicit >>= 8; |
| int b0 = implicit & 0xFF; |
| |
| // simple parameter checks |
| if (b0 < min3Primary || b0 > max4Primary |
| || b1 < minTrail || b1 > maxTrail) return -1; |
| // normal offsets |
| b1 -= minTrail; |
| |
| // take care of the final values, and compose |
| if (b0 < min4Primary) { |
| if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1; |
| b2 -= minTrail; |
| int remainder = b2 % final3Multiplier; |
| if (remainder != 0) return -1; |
| b0 -= min3Primary; |
| b2 /= final3Multiplier; |
| result = ((b0 * medialCount) + b1) * final3Count + b2; |
| } else { |
| if (b2 < minTrail || b2 > maxTrail |
| || b3 < minTrail || b3 > max4Trail) return -1; |
| b2 -= minTrail; |
| b3 -= minTrail; |
| int remainder = b3 % final4Multiplier; |
| if (remainder != 0) return -1; |
| b3 /= final4Multiplier; |
| b0 -= min4Primary; |
| result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; |
| } |
| // final check |
| if (result < 0 || result > MAX_INPUT) return -1; |
| return result; |
| } |
| |
| /** |
| * Generate the implicit CE, from raw integer. |
| * Left shifted to put the first byte at the top of an int. |
| * @param cp code point |
| * @return Primary implicit weight |
| */ |
| public int getImplicitFromRaw(int cp) { |
| if (cp < 0 || cp > MAX_INPUT) { |
| throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); |
| } |
| int last0 = cp - min4Boundary; |
| if (last0 < 0) { |
| int last1 = cp / final3Count; |
| last0 = cp % final3Count; |
| |
| int last2 = last1 / medialCount; |
| last1 %= medialCount; |
| |
| last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start |
| last1 = minTrail + last1; // offset |
| last2 = min3Primary + last2; // offset |
| |
| if (last2 >= min4Primary) { |
| throw new IllegalArgumentException("4-byte out of range: " + |
| Utility.hex(cp) + ", " + Utility.hex(last2)); |
| } |
| |
| return (last2 << 24) + (last1 << 16) + (last0 << 8); |
| } else { |
| int last1 = last0 / final4Count; |
| last0 %= final4Count; |
| |
| int last2 = last1 / medialCount; |
| last1 %= medialCount; |
| |
| int last3 = last2 / medialCount; |
| last2 %= medialCount; |
| |
| last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start |
| last1 = minTrail + last1; // offset |
| last2 = minTrail + last2; // offset |
| last3 = min4Primary + last3; // offset |
| |
| if (last3 > max4Primary) { |
| throw new IllegalArgumentException("4-byte out of range: " + |
| Utility.hex(cp) + ", " + Utility.hex(last3)); |
| } |
| |
| return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; |
| } |
| } |
| |
| /** |
| * Gets an Implicit from a code point. Internally, |
| * swaps (which produces a raw value 0..220000, |
| * then converts raw to implicit. |
| * @param cp The code point to convert to implicit. |
| * @return Primary implicit weight |
| */ |
| public int getImplicitFromCodePoint(int cp) { |
| if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); |
| |
| // Produce Raw value |
| // note, we add 1 so that the first value is always empty!! |
| cp = ImplicitCEGenerator.swapCJK(cp) + 1; |
| // we now have a range of numbers from 0 to 220000. |
| |
| if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); |
| |
| return getImplicitFromRaw(cp); |
| } |
| |
| /** |
| * Function used to: |
| * a) collapse the 2 different Han ranges from UCA into one (in the right order), and |
| * b) bump any non-CJK characters by 10FFFF. |
| * The relevant blocks are: |
| * A: 4E00..9FFF; CJK Unified Ideographs |
| * F900..FAFF; CJK Compatibility Ideographs |
| * B: 3400..4DBF; CJK Unified Ideographs Extension A |
| * 20000..XX; CJK Unified Ideographs Extension B (and others later on) |
| * As long as |
| * no new B characters are allocated between 4E00 and FAFF, and |
| * no new A characters are outside of this range, |
| * (very high probability) this simple code will work. |
| * The reordered blocks are: |
| * Block1 is CJK |
| * Block2 is CJK_COMPAT_USED |
| * Block3 is CJK_A |
| * (all contiguous) |
| * Any other CJK gets its normal code point |
| * Any non-CJK gets +10FFFF |
| * When we reorder Block1, we make sure that it is at the very start, |
| * so that it will use a 3-byte form. |
| * Warning: the we only pick up the compatibility characters that are |
| * NOT decomposed, so that block is smaller! |
| */ |
| |
| static int NON_CJK_OFFSET = 0x110000; |
| |
| static int swapCJK(int i) { |
| |
| if (i >= CJK_BASE) { |
| if (i < CJK_LIMIT) return i - CJK_BASE; |
| |
| if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE |
| + (CJK_LIMIT - CJK_BASE); |
| if (i < CJK_B_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_B_LIMIT) return i; // non-BMP-CJK |
| |
| return i + NON_CJK_OFFSET; // non-CJK |
| } |
| if (i < CJK_A_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_A_LIMIT) return i - CJK_A_BASE |
| + (CJK_LIMIT - CJK_BASE) |
| + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
| return i + NON_CJK_OFFSET; // non-CJK |
| } |
| |
| |
| /** |
| * @return Minimal trail value |
| */ |
| public int getMinTrail() { |
| return minTrail; |
| } |
| |
| /** |
| * @return Maximal trail value |
| */ |
| public int getMaxTrail() { |
| return maxTrail; |
| } |
| |
| public int getCodePointFromRaw(int i) { |
| i--; |
| int result = 0; |
| if(i >= NON_CJK_OFFSET) { |
| result = i - NON_CJK_OFFSET; |
| } else if(i >= CJK_B_BASE) { |
| result = i; |
| } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { |
| // rest of CJKs, compacted |
| if(i < CJK_LIMIT - CJK_BASE) { |
| result = i + CJK_BASE; |
| } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { |
| result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); |
| } else { |
| result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
| } |
| } else { |
| result = -1; |
| } |
| return result; |
| } |
| |
| public int getRawFromCodePoint(int i) { |
| return swapCJK(i)+1; |
| } |
| } |