/**
*******************************************************************************
* Copyright (C) 2002-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.tool.layout;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Eric Mader
*
* Notes:
*
* The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
* decomposition.
*
* So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
* will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
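*
* For example, a minimal sketch using the UnicodeSet API imported above
* (canonLGC is just an illustrative name):
*
*     UnicodeSet canonLGC = new UnicodeSet(
*         "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]");
*     boolean b = canonLGC.contains(0x00E9); // true: U+00E9 decomposes canonically to U+0065 U+0301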
*
* Are these three scripts enough? Do we want to collect them all at once and distribute by script,
* or process them one script at a time? It's probably a good idea to build a single table for
* however many scripts there are.
*
* It might be better to collect all the characters that have a canonical decomposition and just
* sort them into however many scripts there are... unless we'll get characters in COMMON???
*/
public class CanonGSUBBuilder
{
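/**
* Shape an Arabic presentation-form decomposition into the string that the
* contextual ligature tree should match: the decomposition is shaped for the
* joining context implied by its decomposition type (isolated, final, initial
* or medial), and the ligature character itself is appended to the result.
*/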
public static String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
{
int leftType = ArabicShaping.VALUE_NONE;
int rightType = ArabicShaping.VALUE_NONE;
switch (type) {
case UCharacter.DecompositionType.ISOLATED:
break;
case UCharacter.DecompositionType.FINAL:
rightType = ArabicShaping.VALUE_LEFT;
break;
case UCharacter.DecompositionType.INITIAL:
leftType = ArabicShaping.VALUE_RIGHT;
break;
case UCharacter.DecompositionType.MEDIAL:
rightType = ArabicShaping.VALUE_LEFT;
leftType = ArabicShaping.VALUE_RIGHT;
break;
default:
return decomp + UCharacter.toString(ligature);
}
char[] chars = decomp.toCharArray();
ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
return new String(chars) + UCharacter.toString(ligature);
}
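/**
* Walk the Arabic character data and, for every presentation form whose
* decomposition is a single character, record a mapping from the nominal
* (isolated) character to that presentation form in the class table matching
* the form's decomposition type (initial, medial, final or isolated).
*/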
static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
ClassTable finaClassTable, ClassTable isolClassTable)
{
System.out.print("Finding Arabic contextual forms... ");
for (int i = 0; i < data.countRecords(); i += 1) {
ArabicCharacterData.Record record = data.getRecord(i);
String decomposition = record.getDecomposition();
if (decomposition != null && decomposition.length() == 1) {
int contextual = record.getCodePoint();
int isolated = UTF16.charAt(decomposition, 0);
switch (record.getDecompositionType()) {
case UCharacter.DecompositionType.INITIAL:
initClassTable.addMapping(isolated, contextual);
break;
case UCharacter.DecompositionType.MEDIAL:
mediClassTable.addMapping(isolated, contextual);
break;
case UCharacter.DecompositionType.FINAL:
finaClassTable.addMapping(isolated, contextual);
break;
case UCharacter.DecompositionType.ISOLATED:
isolClassTable.addMapping(isolated, contextual);
break;
default:
// issue some error message?
break;
}
}
}
System.out.println("Done.");
}
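/**
* Build the ligature tree for Arabic presentation-form ligatures: each record
* with a multi-character decomposition and an initial, medial, final or isolated
* decomposition type is shaped with convertArabicString and inserted into the tree.
*/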
static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
{
LigatureTree contextualTree = new LigatureTree();
int ligatureCount = 0;
System.out.print("Building Arabic ligature tree... ");
for (int i = 0; i < data.countRecords(); i += 1) {
ArabicCharacterData.Record record = data.getRecord(i);
String decomposition = record.getDecomposition();
if (decomposition != null && decomposition.length() > 1) {
int ligature = record.getCodePoint();
int decompType = record.getDecompositionType();
switch (decompType) {
case UCharacter.DecompositionType.FINAL:
case UCharacter.DecompositionType.INITIAL:
case UCharacter.DecompositionType.MEDIAL:
case UCharacter.DecompositionType.ISOLATED:
contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
ligatureCount += 1;
break;
case UCharacter.DecompositionType.CANONICAL:
// canonicalTree.insert(decomposition + UCharacter.toString(ligature));
break;
}
}
}
System.out.println(ligatureCount + " ligatures.");
return contextualTree;
}
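// Glyph classes matching the OpenType GDEF GlyphClassDef values:
// 1 = base (simple) glyph, 2 = ligature glyph, 3 = mark glyph, 4 = component glyph.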
static final int SIMPLE_GLYPH = 1;
static final int LIGATURE_GLYPH = 2;
static final int MARK_GLYPH = 3;
static final int COMPONENT_GLYPH = 4;
static final int categoryClassMap[] = {
0, // UNASSIGNED
SIMPLE_GLYPH, // UPPERCASE_LETTER
SIMPLE_GLYPH, // LOWERCASE_LETTER
SIMPLE_GLYPH, // TITLECASE_LETTER
SIMPLE_GLYPH, // MODIFIER_LETTER
SIMPLE_GLYPH, // OTHER_LETTER
MARK_GLYPH, // NON_SPACING_MARK
MARK_GLYPH, // ENCLOSING_MARK ??
MARK_GLYPH, // COMBINING_SPACING_MARK ??
SIMPLE_GLYPH, // DECIMAL_NUMBER
SIMPLE_GLYPH, // LETTER_NUMBER
SIMPLE_GLYPH, // OTHER_NUMBER
0, // SPACE_SEPARATOR
0, // LINE_SEPARATOR
0, // PARAGRAPH_SEPARATOR
0, // CONTROL
0, // FORMAT
0, // PRIVATE_USE
0, // SURROGATE
SIMPLE_GLYPH, // DASH_PUNCTUATION
SIMPLE_GLYPH, // START_PUNCTUATION
SIMPLE_GLYPH, // END_PUNCTUATION
SIMPLE_GLYPH, // CONNECTOR_PUNCTUATION
SIMPLE_GLYPH, // OTHER_PUNCTUATION
SIMPLE_GLYPH, // MATH_SYMBOL
SIMPLE_GLYPH, // CURRENCY_SYMBOL
SIMPLE_GLYPH, // MODIFIER_SYMBOL
SIMPLE_GLYPH, // OTHER_SYMBOL
SIMPLE_GLYPH, // INITIAL_PUNCTUATION
SIMPLE_GLYPH // FINAL_PUNCTUATION
};
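/**
* Classify a character for the GDEF glyph class table: anything with a
* multi-character decomposition is a ligature glyph; everything else is
* classified by its general category via categoryClassMap.
*/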
static int getGlyphClass(ArabicCharacterData.Record record)
{
String decomp = record.getDecomposition();
if (decomp != null && decomp.length() > 1) {
return LIGATURE_GLYPH;
}
return categoryClassMap[record.getGeneralCategory()];
}
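/**
* Add a GDEF glyph class mapping for every character in the Arabic data.
*/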
static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
{
System.out.print("Adding Arabic glyph classes... ");
for (int i = 0; i < data.countRecords(); i += 1) {
ArabicCharacterData.Record record = data.getRecord(i);
classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
}
System.out.println("Done.");
}
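/**
* Build the Arabic-specific tables: GDEF glyph classes for the Arabic characters,
* single-substitution lookups for the 'init', 'medi' and 'fina' features, and a
* presentation-form ligature lookup for 'liga', all registered under the 'arab'
* script's default language system.
*/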
private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
LookupList lookupList, ClassTable classTable) {
// TODO: Might want to have the ligature table builder explicitly check for ligatures
// which start with space and tatweel rather than pulling them out here...
UnicodeSet arabicBlock = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
UnicodeSet oddLigatures = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));
addArabicGlyphClasses(arabicData, classTable);
ClassTable initClassTable = new ClassTable();
ClassTable mediClassTable = new ClassTable();
ClassTable finaClassTable = new ClassTable();
ClassTable isolClassTable = new ClassTable();
buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
isolClassTable.snapshot();
LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);
LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
ligaTree.walk(ligaWalker);
Lookup initLookup, mediLookup, finaLookup, ligaLookup;
initLookup = new Lookup(Lookup.GSST_Single, 0);
initLookup.addSubtable(initClassTable);
mediLookup = new Lookup(Lookup.GSST_Single, 0);
mediLookup.addSubtable(mediClassTable);
finaLookup = new Lookup(Lookup.GSST_Single, 0);
finaLookup.addSubtable(finaClassTable);
ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
ligaLookup.addSubtable(ligaWalker);
Feature init = new Feature("init");
Feature medi = new Feature("medi");
Feature fina = new Feature("fina");
Feature liga = new Feature("liga");
init.addLookup(lookupList.addLookup(initLookup));
medi.addLookup(lookupList.addLookup(mediLookup));
fina.addLookup(lookupList.addLookup(finaLookup));
liga.addLookup(lookupList.addLookup(ligaLookup));
featureList.addFeature(init);
featureList.addFeature(medi);
featureList.addFeature(fina);
featureList.addFeature(liga);
scriptList.addFeature("arab", "(default)", init);
scriptList.addFeature("arab", "(default)", medi);
scriptList.addFeature("arab", "(default)", fina);
scriptList.addFeature("arab", "(default)", liga);
System.out.println();
}
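/**
* For the given script, insert an "equivalent sequence + composed character"
* string into the ligature tree for every canonical equivalent of every
* composed character, so that composition can be performed as a ligature
* substitution.
*/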
public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
{
int ligatureCount = 0;
System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
for (int i = 0; i < data.countRecords(script); i += 1) {
CanonicalCharacterData.Record record = data.getRecord(script, i);
String composed = UCharacter.toString(record.getComposedCharacter());
for (int e = 0; e < record.countEquivalents(); e += 1) {
String equivalent = record.getEquivalent(e);
ligatureTree.insert(equivalent + composed);
ligatureCount += 1;
}
}
System.out.println(ligatureCount + " ligatures.");
}
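/**
* Build one decomposition table per equivalent: the i-th table (zero-based) maps
* each composed character to its i-th canonical equivalent, so a character with
* n equivalents appears in the first n tables.
*/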
public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
{
int maxDecompCount = data.getMaxEquivalents(script);
DecompTable[] decompTables = new DecompTable[maxDecompCount];
System.out.print("Building decompositon tables for " + UScript.getName(script) +
"... total decompositions: " + data.countRecords(script) +
", max: " + maxDecompCount + "...");
for (int i = 0; i < maxDecompCount; i += 1) {
DecompTable table = new DecompTable();
for (int r = 0; r < data.countRecords(script); r += 1) {
CanonicalCharacterData.Record record = data.getRecord(script, r);
if (record.countEquivalents() > i) {
table.add(record.getComposedCharacter(), record.getEquivalent(i));
}
}
decompTables[i] = table;
}
System.out.println(" Done.");
return decompTables;
}
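/**
* Build the composition (ligature) and decomposition (multiple substitution)
* lookups for a script and return their indices in the lookup list:
* lookups[0] is the composition lookup, lookups[1] the decomposition lookup.
*/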
public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
{
int[] lookups = new int[2];
DecompTable[] decompTables = buildDecompTables(data, script);
LigatureTree compTree = new LigatureTree();
buildLigatureTree(data, script, compTree);
System.out.println();
LigatureTreeWalker compWalker = new LigatureTreeWalker();
compTree.walk(compWalker);
Lookup compLookup, dcmpLookup;
//int compLookupIndex, dcmpLookupIndex;
compLookup = new Lookup(Lookup.GSST_Ligature, 0);
compLookup.addSubtable(compWalker);
dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
for (int i = 0; i < decompTables.length; i += 1) {
dcmpLookup.addSubtable(decompTables[i]);
}
lookups[0] = lookupList.addLookup(compLookup);
lookups[1] = lookupList.addLookup(dcmpLookup);
return lookups;
}
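/**
* Add each of the given lookup indices to the feature.
*/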
public static void addLookups(Feature feature, int[] lookups)
{
for (int i = 0; i < lookups.length; i += 1) {
feature.addLookup(lookups[i]);
}
}
/**
* Build the mark class table written to the generated GDEF data: every combining
* mark (canonical combining class != 0) is mapped to its combining class, except
* for the Hebrew and Arabic marks listed below, which get hand-assigned classes
* so that they sort into the desired order.
*
* Hebrew mark order taken from the SBL Hebrew Font manual.
* Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks.
*/
public static ClassTable buildCombiningClassTable()
{
UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
ClassTable exceptions = new ClassTable();
ClassTable combiningClasses = new ClassTable();
int markCount = markSet.size();
exceptions.addMapping(0x05C1, 10); // Point Shin Dot
exceptions.addMapping(0x05C2, 11); // Point Sin Dot
exceptions.addMapping(0x05BC, 21); // Point Dagesh or Mapiq
exceptions.addMapping(0x05BF, 23); // Point Rafe
exceptions.addMapping(0x05B9, 27); // Point Holam
exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
exceptions.addMapping(0x0591, 220); // Accent Etnahta
exceptions.addMapping(0x0596, 220); // Accent Tipeha
exceptions.addMapping(0x059B, 220); // Accent Tevir
exceptions.addMapping(0x05A3, 220); // Accent Munah
exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
exceptions.addMapping(0x05A5, 220); // Accent Merkha
exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
exceptions.addMapping(0x05A7, 220); // Accent Darga
exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
exceptions.addMapping(0x05B0, 220); // Point Sheva
exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
exceptions.addMapping(0x05B4, 220); // Point Hiriq
exceptions.addMapping(0x05B5, 220); // Point Tsere
exceptions.addMapping(0x05B6, 220); // Point Segol
exceptions.addMapping(0x05B7, 220); // Point Patah
exceptions.addMapping(0x05B8, 220); // Point Qamats
exceptions.addMapping(0x05BB, 220); // Point Qubuts
exceptions.addMapping(0x05BD, 220); // Point Meteg
exceptions.addMapping(0x059A, 222); // Accent Yetiv
exceptions.addMapping(0x05AD, 222); // Accent Dehi
exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
exceptions.addMapping(0x0593, 230); // Accent Shalshelet
exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
exceptions.addMapping(0x0597, 230); // Accent Revia
exceptions.addMapping(0x0598, 230); // Accent Zarqa
exceptions.addMapping(0x059F, 230); // Accent Qarney Para
exceptions.addMapping(0x059E, 230); // Accent Gershayim
exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
exceptions.addMapping(0x059C, 230); // Accent Geresh
exceptions.addMapping(0x0592, 230); // Accent Segolta
exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
exceptions.addMapping(0x05AC, 230); // Accent Iluy
exceptions.addMapping(0x05A8, 230); // Accent Qadma
exceptions.addMapping(0x05AB, 230); // Accent Ole
exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
exceptions.addMapping(0x05A1, 230); // Accent Pazer
//exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
exceptions.addMapping(0x05AE, 232); // Accent Zinor
exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
exceptions.addMapping(0x0599, 232); // Accent Pashta
exceptions.addMapping(0x0655, 27); // ARABIC HAMZA BELOW
exceptions.addMapping(0x0654, 27); // ARABIC HAMZA ABOVE
exceptions.addMapping(0x0651, 28); // ARABIC SHADDA
exceptions.addMapping(0x0656, 29); // ARABIC SUBSCRIPT ALEF
exceptions.addMapping(0x0670, 29); // ARABIC LETTER SUPERSCRIPT ALEF
exceptions.addMapping(0x064D, 30); // ARABIC KASRATAN
exceptions.addMapping(0x0650, 30); // ARABIC KASRA
exceptions.addMapping(0x0652, 31); // ARABIC SUKUN
exceptions.addMapping(0x06E1, 31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
exceptions.addMapping(0x064B, 31); // ARABIC FATHATAN
exceptions.addMapping(0x064C, 31); // ARABIC DAMMATAN
exceptions.addMapping(0x064E, 31); // ARABIC FATHA
exceptions.addMapping(0x064F, 31); // ARABIC DAMMA
exceptions.addMapping(0x0657, 31); // ARABIC INVERTED DAMMA
exceptions.addMapping(0x0658, 31); // ARABIC MARK NOON GHUNNA
exceptions.addMapping(0x0653, 32); // ARABIC MADDAH ABOVE
exceptions.snapshot();
for (int i = 0; i < markCount; i += 1) {
int mark = markSet.charAt(i);
int markClass = exceptions.getGlyphClassID(mark);
if (markClass == 0) {
markClass = UCharacter.getCombiningClass(mark);
}
combiningClasses.addMapping(mark, markClass);
}
combiningClasses.snapshot();
return combiningClasses;
}
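/**
* Top-level builder: collect every character with a canonical decomposition
* (excluding Hangul and full composition exclusions), build per-script 'ccmp'
* composition and decomposition lookups plus the Arabic shaping tables, and
* write the resulting GSUB and GDEF data to the named source file.
*/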
public static void buildDecompTables(String fileName)
{
// F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
//UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
ClassTable classTable = new ClassTable();
LookupList lookupList = new LookupList();
FeatureList featureList = new FeatureList();
ScriptList scriptList = new ScriptList();
// build common, inherited lookups...
// int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
// int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
// This is a bit lame, but it's the only way I can think of
// to make this work w/o knowing the values of COMMON and INHERITED...
if (script == UScript.COMMON || script == UScript.INHERITED ||
data.getMaxEquivalents(script) == 0) {
continue;
}
int[] lookups = buildLookups(data, lookupList, script);
Feature ccmp = new Feature("ccmp");
addLookups(ccmp, lookups);
// addLookups(ccmp, commonLookups);
// addLookups(ccmp, inheritedLookups);
featureList.addFeature(ccmp);
String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
scriptList.addFeature(scriptTag, "(default)", ccmp);
if (script == UScript.ARABIC) {
buildArabicTables(scriptList, featureList, lookupList, classTable);
}
}
featureList.finalizeFeatureList();
ClassTable markClassTable = buildCombiningClassTable();
GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
String[] includeFiles = {"LETypes.h", "CanonShaping.h"};
LigatureModuleWriter writer = new LigatureModuleWriter();
writer.openFile(fileName);
writer.writeHeader(null, includeFiles);
writer.writeTable(gsubWriter);
writer.writeTable(gdefWriter);
writer.writeTrailer();
writer.closeFile();
}
public static void main(String[] args)
{
if (args.length != 1) {
System.err.println("Usage: CanonGSUBBuilder <output file name>");
return;
}
buildDecompTables(args[0]);
}
}