/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompactName.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
| |
| package com.ibm.text.UCD; |
| |
| import java.io.IOException; |
| import java.util.*; |
| import java.io.*; |
| import java.text.*; |
| |
| public class CompactName { |
| |
| static final boolean DEBUG = false; |
| |
| public static void main(String[] args) throws IOException { |
| |
| int test = tokenFromString("ABZ"); |
| String ss = stringFromToken(test); |
| System.out.println(ss); |
| |
| CompactName.addWord("ABSOLUTEISM"); |
| |
| for (int i = 0; i < CompactName.lastToken; ++i) { |
| String s = CompactName.stringFromToken(i); |
| System.out.println(s); |
| } |
| |
| } |
| |
| |
| static final char[] compactMap = new char[128]; |
| static final char[] compactUnmap = new char[128]; |
| |
| static { |
| char counter = 0; |
| compactMap[0] = counter++; |
| for (int i = 'A'; i <= 'Z'; ++i) { |
| compactMap[i] = counter++; |
| } |
| compactMap['-'] = counter++; |
| compactMap['>'] = counter++; |
| compactMap['<'] = counter++; |
| compactMap['*'] = counter++; |
| |
| compactUnmap[0] = 0; |
| for (char i = 0; i < compactUnmap.length; ++i) { |
| int x = compactMap[i]; |
| if (x != 0) compactUnmap[x] = i; |
| } |
| } |
| |
| /* |
| static String expand(String s) { |
| StringBuffer result = new StringBuffer(); |
| for (int i = 0; i < s.length(); ++i) { |
| int m = s.charAt(i); |
| if (m == 31 && i < s.length() + 1) { |
| m = 31 + s.charAt(++i); |
| } |
| result.append(compactUnmap[m]); |
| } |
| return result.toString(); |
| } |
| |
| static String compact(String s) { |
| StringBuffer result = new StringBuffer(); |
| for (int i = 0; i < s.length(); ++i) { |
| int m = compactMap[s.charAt(i)]; |
| if (m >= 31) { |
| result.append((char)31); |
| m -= 31; |
| } |
| result.append(m); |
| } |
| return result.toString(); |
| } |
| */ |
| |
| static Map string_token = new HashMap(); |
| static Map token_string = new HashMap(); |
| |
| static int[] tokenList = new int[40000]; |
| static final int tokenStart = 0; |
| static int lastToken = 0; |
| |
| static int spacedMinimum = Integer.MAX_VALUE; |
| |
| static boolean isLiteral(int i) { |
| return (i & 0x8000) != 0; |
| } |
| |
| static int addTokenForString(String s, int lead, int trail) { |
| Object in = string_token.get(s); |
| if (in != null) throw new IllegalArgumentException(); |
| int value = (lead << 16) + (trail & 0xFFFF); |
| int result = lastToken; |
| tokenList[lastToken++] = value; |
| |
| if (DEBUG) { |
| System.out.println("'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail); |
| String roundTrip = stringFromToken(result); |
| if (!roundTrip.equals(s)) { |
| System.out.println("\t*** No Round Trip: '" + roundTrip + "'"); |
| } |
| } |
| string_token.put(s, new Integer(result)); |
| return result; |
| } |
| |
| static String stringFromToken(int i) { |
| String result; |
| if ((i & 0x8000) != 0) { |
| char first = compactUnmap[(i >> 10) & 0x1F]; |
| char second = compactUnmap[(i >> 5) & 0x1F]; |
| char third = compactUnmap[i & 0x1F]; |
| result = String.valueOf(first); |
| if (second != 0) result += String.valueOf(second); |
| if (third != 0) result += String.valueOf(third); |
| } else if (i > lastToken) { |
| throw new IllegalArgumentException("bad token: " + i); |
| } else { |
| int value = tokenList[i]; |
| int lead = value >>> 16; |
| int trail = value & 0xFFFF; |
| if (i >= spacedMinimum) result = stringFromToken(lead) + ' ' + stringFromToken(trail); |
| else result = stringFromToken(lead) + stringFromToken(trail); |
| } |
| if (DEBUG) System.out.println("token: " + i + " => '" + result + "'"); |
| return result; |
| } |
| |
| static int tokenFromString(String s) { |
| if (s.length() <= 3) { |
| int first = compactMap[s.charAt(0)]; |
| int second = compactMap[s.length() > 1 ? s.charAt(1) : 0]; |
| int third = compactMap[s.length() > 2 ? s.charAt(2) : 0]; |
| return 0x8000 + (first << 10) + (second << 5) + third; |
| } |
| Object in = string_token.get(s); |
| if (in == null) return -1; |
| return ((Integer)in).intValue(); |
| } |
| |
| |
| static int addWord(String s) { |
| |
| int result = tokenFromString(s); |
| if (result != -1) return result; |
| int bestLen = 0; |
| int best_i = 0; |
| |
| int limit = s.length() - 1; |
| |
| for (int i = limit; i >= 1; --i) { |
| |
| String firstPart = s.substring(0, i); |
| String lastPart = s.substring(i); |
| |
| int lead = tokenFromString(firstPart); |
| int trail = tokenFromString(lastPart); |
| |
| if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair |
| if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH"); |
| return addTokenForString(s, lead, trail); |
| } |
| if (!isLiteral(lead)) { |
| if (i > bestLen) { |
| bestLen = i; |
| best_i = i; |
| } |
| } |
| if (!isLiteral(trail)) { |
| int end_i = s.length() - i; |
| if (end_i > bestLen) { |
| bestLen = end_i; |
| best_i = i; |
| } |
| } |
| } |
| if (bestLen > 0) { // if one matches, recurse -- and return pair |
| String firstPart = s.substring(0, best_i); |
| String lastPart = s.substring(best_i); |
| int lead = tokenFromString(firstPart); |
| int trail = tokenFromString(lastPart); |
| if (lead >= 0) { |
| if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST"); |
| return addTokenForString(s, lead, addWord(lastPart)); |
| } else { |
| if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND"); |
| return addTokenForString(s, addWord(firstPart), trail); |
| } |
| } |
| |
| // break at multiple of 3 |
| |
| best_i = ((s.length() + 1) / 6) * 3; |
| String firstPart = s.substring(0, best_i); |
| String lastPart = s.substring(best_i); |
| if (DEBUG) show(s, firstPart, lastPart, "Fallback"); |
| return addTokenForString(s, addWord(firstPart), addWord(lastPart)); |
| } |
| |
| static void show(String s, String firstPart, String lastPart, String comment) { |
| System.out.println((s) + " => '" + (firstPart) |
| + "' # '" + (lastPart) + "' " + comment); |
| } |
| |
| static void startLines() { |
| spacedMinimum = lastToken; |
| } |
| |
| static int addLine(String s) { |
| |
| int result = tokenFromString(s); |
| if (result != -1) return result; |
| int bestLen = 0; |
| int best_i = 0; |
| |
| int limit = s.length() - 2; |
| |
| for (int i = limit; i >= 1; --i) { |
| char c = s.charAt(i); |
| if (c != ' ') continue; |
| |
| String firstPart = s.substring(0, i); |
| String lastPart = s.substring(i+1); |
| |
| int lead = tokenFromString(firstPart); |
| int trail = tokenFromString(lastPart); |
| |
| if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair |
| if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH"); |
| return addTokenForString(s, lead, trail); |
| } |
| if (i > bestLen) { |
| bestLen = i; |
| best_i = i; |
| } |
| |
| int end_i = s.length() - i - 1; |
| if (end_i > bestLen) { |
| bestLen = end_i; |
| best_i = i; |
| } |
| } |
| if (bestLen > 0) { // if one matches, recurse -- and return pair |
| String firstPart = s.substring(0, best_i); |
| String lastPart = s.substring(best_i + 1); |
| int lead = tokenFromString(firstPart); |
| int trail = tokenFromString(lastPart); |
| if (lead >= 0) { |
| if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST"); |
| return addTokenForString(s, lead, addLine(lastPart)); |
| } else { |
| if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND"); |
| return addTokenForString(s, addLine(firstPart), trail); |
| } |
| } |
| |
| System.out.println("SHOULD HAVE MATCHED!!"); |
| throw new IllegalArgumentException("SHOULD HAVE MATCHED!! " + s); |
| } |
| } |