| /** |
| ******************************************************************************* |
| * Copyright (C) 2001-2010, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.dev.demo.translit; |
| import java.util.Enumeration; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Set; |
| import java.util.TreeSet; |
| |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.text.Replaceable; |
| import com.ibm.icu.text.Transliterator; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeFilter; |
| |
| public class AnyTransliterator extends Transliterator { |
| |
| static final boolean DEBUG = false; |
| private String targetName; |
| private RunIterator it; |
| private Position run; |
| |
| |
| public AnyTransliterator(String targetName, UnicodeFilter filter, RunIterator it){ |
| super("Any-" + targetName, filter); |
| this.targetName = targetName; |
| this.it = it; |
| run = new Position(); |
| } |
| |
| public AnyTransliterator(String targetName, UnicodeFilter filter){ |
| this(targetName, filter, new ScriptRunIterator()); |
| } |
| |
| static private Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007E] hex"); |
| |
| protected void handleTransliterate(Replaceable text, |
| Position offsets, boolean isIncremental) { |
| if (DEBUG) { |
| System.out.println("- handleTransliterate " + hex.transliterate(text.toString()) |
| + ", " + toString(offsets)); |
| } |
| it.reset(text, offsets); |
| |
| while (it.next(run)) { |
| if (targetName.equalsIgnoreCase(it.getName())) { |
| if (DEBUG) System.out.println("Skipping identical: " + targetName); |
| run.start = run.limit; // show we processed |
| continue; // skip if same |
| } |
| |
| Transliterator t; |
| String id = it.getName() + '-' + targetName; |
| try { |
| t = Transliterator.getInstance(id); |
| } catch (IllegalArgumentException ex) { |
| if (DEBUG) System.out.println("Couldn't find: " + id + ", Trying Latin as Pivot"); |
| id = it.getName() + "-Latin; Latin-" + targetName; |
| try { |
| t = Transliterator.getInstance(id); |
| } catch (IllegalArgumentException ex2) { |
| if (DEBUG) System.out.println("Couldn't find: " + id); |
| continue; |
| } |
| } |
| // TODO catch error later!! |
| |
| if (DEBUG) { |
| System.out.println(t.getID()); |
| System.out.println("input: " + hex.transliterate(text.toString()) |
| + ", " + toString(run)); |
| } |
| |
| if (isIncremental && it.atEnd()) { |
| t.transliterate(text, run); |
| } else { |
| t.finishTransliteration(text, run); |
| } |
| // adjust the offsets in line with the changes |
| it.adjust(run.limit); |
| |
| if (DEBUG) { |
| System.out.println("output: " + hex.transliterate(text.toString()) |
| + ", " + toString(run)); |
| } |
| } |
| |
| // show how far we got! |
| it.getExpanse(offsets); |
| if (run.start == run.limit) offsets.start = offsets.limit; |
| else offsets.start = run.start; |
| if (DEBUG) { |
| System.out.println("+ handleTransliterate: " + ", " + toString(offsets)); |
| System.out.println(); |
| } |
| } |
| |
| // should be method on Position |
| public static String toString(Position offsets) { |
| return "[cs: " + offsets.contextStart |
| + ", s: " + offsets.start |
| + ", l: " + offsets.limit |
| + ", cl: " + offsets.contextLimit |
| + "]"; |
| } |
| |
| public interface RunIterator { |
| public void reset(Replaceable text, Position expanse); |
| public void getExpanse(Position run); |
| public void reset(); |
| public boolean next(Position run); |
| public void getCurrent(Position run); |
| public String getName(); |
| public void adjust(int newCurrentLimit); |
| public boolean atEnd(); |
| } |
| |
| /** |
| * Returns a series of ranges corresponding to scripts. They will be of the form: |
| * ccccSScSSccccTTcTcccc - where c is common, S is the first script and T is the second |
| *| | - first run |
| * | | - second run |
| * That is, the runs will overlap. The reason for this is so that a transliterator can |
| * consider common characters both before and after the scripts. |
| * The only time that contextStart != start is for the first run |
| * (the context is the start context of the entire expanse) |
| * The only time that contextLimit != limit is for the last run |
| * (the context is the end context of the entire expanse) |
| */ |
| public static class ScriptRunIterator implements RunIterator { |
| private Replaceable text; |
| private Position expanse = new Position(); |
| private Position current = new Position(); |
| private int script; |
| private boolean done = true; |
| |
| |
| public void reset(Replaceable repText, Position expansePos) { |
| set(this.expanse, expansePos); |
| this.text = repText; |
| reset(); |
| } |
| |
| public void reset() { |
| done = false; |
| //this.expanse = expanse; |
| script = UScript.INVALID_CODE; |
| // set up first range to be empty, at beginning |
| current.contextStart = expanse.contextStart; |
| current.start = current.limit = current.contextLimit = expanse.start; |
| } |
| |
| public boolean next(Position run) { |
| if (done) return false; |
| if (DEBUG) { |
| System.out.println("+cs: " + current.contextStart |
| + ", s: " + current.start |
| + ", l: " + current.limit |
| + ", cl: " + current.contextLimit); |
| } |
| // reset start context run to the last end |
| current.start = current.limit; |
| |
| // Phase 1. Backup the START value through COMMON until we get to expanse.start or a real script. |
| int i, cp; |
| int limit = expanse.start; |
| for (i = current.start; i > limit; i -= UTF16.getCharCount(cp)) { |
| cp = text.char32At(i); |
| int scrpt = UScript.getScript(cp); |
| if (scrpt != UScript.COMMON && scrpt != UScript.INHERITED) break; |
| } |
| current.start = i; |
| current.contextStart = (i == limit) ? expanse.contextStart : i; // extend at start |
| |
| // PHASE 2. Move up the LIMIT value through COMMON or single script until we get to expanse.limit |
| int lastScript = UScript.COMMON; |
| //int veryLastScript = UScript.COMMON; |
| limit = expanse.limit; |
| for (i = current.limit; i < limit; i += UTF16.getCharCount(cp)) { |
| cp = text.char32At(i); |
| int scrpt = UScript.getScript(cp); |
| if (scrpt == UScript.INHERITED) scrpt = UScript.COMMON; |
| if (scrpt != UScript.COMMON) { |
| // if we find a real script: |
| // if we already had a script, bail |
| // otherwise set our script |
| if (lastScript == UScript.COMMON) lastScript = scrpt; |
| else if (lastScript != scrpt) break; |
| } |
| } |
| current.limit = i; |
| current.contextLimit = (i == limit) ? expanse.contextLimit : i; // extend at end |
| done = (i == limit); |
| script = lastScript; |
| |
| if (DEBUG) { |
| System.out.println("-cs: " + current.contextStart |
| + ", s: " + current.start |
| + ", l: " + current.limit |
| + ", cl: " + current.contextLimit); |
| } |
| |
| set(run, current); |
| return true; |
| } |
| |
| // SHOULD BE METHOD ON POSITION |
| public static void set(Position run, Position current) { |
| run.contextStart = current.contextStart; |
| run.start = current.start; |
| run.limit = current.limit; |
| run.contextLimit = current.contextLimit; |
| } |
| |
| public boolean atEnd() { |
| return current.limit == expanse.limit; |
| } |
| |
| public void getCurrent(Position run) { |
| set(run, current); |
| } |
| |
| public void getExpanse(Position run) { |
| set(run, expanse); |
| } |
| |
| public String getName() { |
| return UScript.getName(script); |
| } |
| |
| public void adjust(int newCurrentLimit) { |
| if (expanse == null) { |
| throw new IllegalArgumentException("Must reset() before calling"); |
| } |
| int delta = newCurrentLimit - current.limit; |
| current.limit += delta; |
| current.contextLimit += delta; |
| expanse.limit += delta; |
| expanse.contextLimit += delta; |
| } |
| |
| // register Any-Script for every script. |
| |
| private static Set scriptList = new HashSet(); |
| |
| public static void registerAnyToScript() { |
| synchronized (scriptList) { |
| Enumeration sources = Transliterator.getAvailableSources(); |
| while(sources.hasMoreElements()) { |
| String source = (String) sources.nextElement(); |
| if (source.equals("Any")) continue; // to keep from looping |
| |
| Enumeration targets = Transliterator.getAvailableTargets(source); |
| while(targets.hasMoreElements()) { |
| String target = (String) targets.nextElement(); |
| if (UScript.getCode(target) == null) continue; // SKIP unless we have a script (or locale) |
| if (scriptList.contains(target)) continue; // already encountered |
| scriptList.add(target); // otherwise add for later testing |
| |
| Set variantSet = add(new TreeSet(), Transliterator.getAvailableVariants(source, target)); |
| if (variantSet.size() < 2) { |
| AnyTransliterator at = new AnyTransliterator(target, null); |
| DummyFactory.add(at.getID(), at); |
| } else { |
| Iterator variants = variantSet.iterator(); |
| while(variants.hasNext()) { |
| String variant = (String) variants.next(); |
| AnyTransliterator at = new AnyTransliterator( |
| (variant.length() > 0) ? target + "/" + variant : target, null); |
| DummyFactory.add(at.getID(), at); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| static class DummyFactory implements Transliterator.Factory { |
| static DummyFactory singleton = new DummyFactory(); |
| static HashMap m = new HashMap(); |
| |
| // Since Transliterators are immutable, we don't have to clone on set & get |
| static void add(String ID, Transliterator t) { |
| m.put(ID, t); |
| System.out.println("Registering: " + ID + ", " + t.toRules(true)); |
| Transliterator.registerFactory(ID, singleton); |
| } |
| public Transliterator getInstance(String ID) { |
| return (Transliterator) m.get(ID); |
| } |
| } |
| |
| // Nice little Utility for converting Enumeration to collection |
| static Set add(Set s, Enumeration enumeration) { |
| while(enumeration.hasMoreElements()) { |
| s.add(enumeration.nextElement()); |
| } |
| return s; |
| } |
| |
| |
| } |
| } |