/**
*******************************************************************************
* Copyright (C) 2002-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.tool.layout;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Eric Mader
*
* Notes:
*
* The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
* decomposition.
*
* So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
* will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
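*
* For example, a minimal sketch using the UnicodeSet API imported above
* (canonLGC is just an illustrative name):
*
*     UnicodeSet canonLGC = new UnicodeSet(
*         "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]");
*     boolean b = canonLGC.contains(0x00E9); // true: U+00E9 decomposes canonically to U+0065 U+0301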
*
* Are these three scripts enough? Do we want to collect them all at once and distribute by script,
* or process them one script at a time? It's probably a good idea to build a single table for
* however many scripts there are.
*
* It might be better to collect all the characters that have a canonical decomposition and just
* sort them into however many scripts there are... unless we'll get characters in COMMON???
*/
public class CanonGSUBBuilder
{
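/**
* Shape an Arabic presentation-form decomposition into the string that the
* contextual ligature tree should match: the decomposition is shaped for the
* joining context implied by its decomposition type (isolated, final, initial
* or medial), and the ligature character itself is appended to the result.
*/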
public static String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
{
int leftType = ArabicShaping.VALUE_NONE;
int rightType = ArabicShaping.VALUE_NONE;
switch (type) {
case UCharacter.DecompositionType.ISOLATED:
break;
case UCharacter.DecompositionType.FINAL:
rightType = ArabicShaping.VALUE_LEFT;
break;
case UCharacter.DecompositionType.INITIAL:
leftType = ArabicShaping.VALUE_RIGHT;
break;
case UCharacter.DecompositionType.MEDIAL:
rightType = ArabicShaping.VALUE_LEFT;
leftType = ArabicShaping.VALUE_RIGHT;
break;
default:
return decomp + UCharacter.toString(ligature);
}
char[] chars = decomp.toCharArray();
ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
return new String(chars) + UCharacter.toString(ligature);
}
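/**
* Walk the Arabic character data and, for every presentation form whose
* decomposition is a single character, record a mapping from the nominal
* (isolated) character to that presentation form in the class table matching
* the form's decomposition type (initial, medial, final or isolated).
*/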
static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
ClassTable finaClassTable, ClassTable isolClassTable)
{
System.out.print("Finding Arabic contextual forms... ");
for (int i = 0; i < data.countRecords(); i += 1) {
ArabicCharacterData.Record record = data.getRecord(i);
String decomposition = record.getDecomposition();
if (decomposition != null && decomposition.length() == 1) {
int contextual = record.getCodePoint();
int isolated = UTF16.charAt(decomposition, 0);
switch (record.getDecompositionType()) {
case UCharacter.DecompositionType.INITIAL:
initClassTable.addMapping(isolated, contextual);
break;
case UCharacter.DecompositionType.MEDIAL:
mediClassTable.addMapping(isolated, contextual);
break;
case UCharacter.DecompositionType.FINAL:
finaClassTable.addMapping(isolated, contextual);
break;
case UCharacter.DecompositionType.ISOLATED:
isolClassTable.addMapping(isolated, contextual);
break;
default:
// issue some error message?
break;
}
}
}
System.out.println("Done.");
}
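/**
* Build the ligature tree for Arabic presentation-form ligatures: each record
* with a multi-character decomposition and an initial, medial, final or isolated
* decomposition type is shaped with convertArabicString and inserted into the tree.
*/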
static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
{
LigatureTree contextualTree = new LigatureTree();
int ligatureCount = 0;
System.out.print("Building Arabic ligature tree... ");
for (int i = 0; i < data.countRecords(); i += 1) {
ArabicCharacterData.Record record = data.getRecord(i);
String decomposition = record.getDecomposition();
if (decomposition != null && decomposition.length() > 1) {
int ligature = record.getCodePoint();
int decompType = record.getDecompositionType();
switch (decompType) {
case UCharacter.DecompositionType.FINAL:
case UCharacter.DecompositionType.INITIAL:
case UCharacter.DecompositionType.MEDIAL:
case UCharacter.DecompositionType.ISOLATED:
contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
ligatureCount += 1;
break;
case UCharacter.DecompositionType.CANONICAL:
// canonicalTree.insert(decomposition + UCharacter.toString(ligature));
break;
}
}
}
System.out.println(ligatureCount + " ligatures.");
return contextualTree;
}
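// Glyph classes matching the OpenType GDEF GlyphClassDef values:
// 1 = base (simple) glyph, 2 = ligature glyph, 3 = mark glyph, 4 = component glyph.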
static final int SIMPLE_GLYPH = 1;
static final int LIGATURE_GLYPH = 2;
static final int MARK_GLYPH = 3;
static final int COMPONENT_GLYPH = 4;
static final int categoryClassMap[] = {
0, // UNASSIGNED
SIMPLE_GLYPH, // UPPERCASE_LETTER
SIMPLE_GLYPH, // LOWERCASE_LETTER
SIMPLE_GLYPH, // TITLECASE_LETTER
SIMPLE_GLYPH, // MODIFIER_LETTER
SIMPLE_GLYPH, // OTHER_LETTER
MARK_GLYPH, // NON_SPACING_MARK
MARK_GLYPH, // ENCLOSING_MARK ??
MARK_GLYPH, // COMBINING_SPACING_MARK ??
SIMPLE_GLYPH, // DECIMAL_NUMBER
SIMPLE_GLYPH, // LETTER_NUMBER
SIMPLE_GLYPH, // OTHER_NUMBER
0, // SPACE_SEPARATOR
0, // LINE_SEPARATOR
0, // PARAGRAPH_SEPARATOR
0, // CONTROL
0, // FORMAT
0, // PRIVATE_USE
0, // SURROGATE
SIMPLE_GLYPH, // DASH_PUNCTUATION
SIMPLE_GLYPH, // START_PUNCTUATION
SIMPLE_GLYPH, // END_PUNCTUATION
SIMPLE_GLYPH, // CONNECTOR_PUNCTUATION
SIMPLE_GLYPH, // OTHER_PUNCTUATION
SIMPLE_GLYPH, // MATH_SYMBOL
SIMPLE_GLYPH, // CURRENCY_SYMBOL
SIMPLE_GLYPH, // MODIFIER_SYMBOL
SIMPLE_GLYPH, // OTHER_SYMBOL
SIMPLE_GLYPH, // INITIAL_PUNCTUATION
SIMPLE_GLYPH // FINAL_PUNCTUATION
};
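/**
* Classify a character for the GDEF glyph class table: anything with a
* multi-character decomposition is a ligature glyph; everything else is
* classified by its general category via categoryClassMap.
*/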
static int getGlyphClass(ArabicCharacterData.Record record)
{
String decomp = record.getDecomposition();
if (decomp != null && decomp.length() > 1) {
return LIGATURE_GLYPH;
}
return categoryClassMap[record.getGeneralCategory()];
}
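/**
* Add a GDEF glyph class mapping for every character in the Arabic data.
*/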
static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
{
System.out.print("Adding Arabic glyph classes... ");
for (int i = 0; i < data.countRecords(); i += 1) {
ArabicCharacterData.Record record = data.getRecord(i);
classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
}
System.out.println("Done.");
}
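/**
* Build the Arabic-specific tables: GDEF glyph classes for the Arabic characters,
* single-substitution lookups for the 'init', 'medi' and 'fina' features, and a
* presentation-form ligature lookup for 'liga', all registered under the 'arab'
* script's default language system.
*/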
private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
LookupList lookupList, ClassTable classTable) {
// TODO: Might want to have the ligature table builder explicitly check for ligatures
// which start with space and tatweel rather than pulling them out here...
UnicodeSet arabicBlock = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
UnicodeSet oddLigatures = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));
addArabicGlyphClasses(arabicData, classTable);
ClassTable initClassTable = new ClassTable();
ClassTable mediClassTable = new ClassTable();
ClassTable finaClassTable = new ClassTable();
ClassTable isolClassTable = new ClassTable();
buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
isolClassTable.snapshot();
LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);
LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
ligaTree.walk(ligaWalker);
Lookup initLookup, mediLookup, finaLookup, ligaLookup;
initLookup = new Lookup(Lookup.GSST_Single, 0);
initLookup.addSubtable(initClassTable);
mediLookup = new Lookup(Lookup.GSST_Single, 0);
mediLookup.addSubtable(mediClassTable);
finaLookup = new Lookup(Lookup.GSST_Single, 0);
finaLookup.addSubtable(finaClassTable);
ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
ligaLookup.addSubtable(ligaWalker);
Feature init = new Feature("init");
Feature medi = new Feature("medi");
Feature fina = new Feature("fina");
Feature liga = new Feature("liga");
init.addLookup(lookupList.addLookup(initLookup));
medi.addLookup(lookupList.addLookup(mediLookup));
fina.addLookup(lookupList.addLookup(finaLookup));
liga.addLookup(lookupList.addLookup(ligaLookup));
featureList.addFeature(init);
featureList.addFeature(medi);
featureList.addFeature(fina);
featureList.addFeature(liga);
scriptList.addFeature("arab", "(default)", init);
scriptList.addFeature("arab", "(default)", medi);
scriptList.addFeature("arab", "(default)", fina);
scriptList.addFeature("arab", "(default)", liga);
System.out.println();
}
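/**
* For the given script, insert an "equivalent sequence + composed character"
* string into the ligature tree for every canonical equivalent of every
* composed character, so that composition can be performed as a ligature
* substitution.
*/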
public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
{
int ligatureCount = 0;
System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
for (int i = 0; i < data.countRecords(script); i += 1) {
CanonicalCharacterData.Record record = data.getRecord(script, i);
String composed = UCharacter.toString(record.getComposedCharacter());
for (int e = 0; e < record.countEquivalents(); e += 1) {
String equivalent = record.getEquivalent(e);
ligatureTree.insert(equivalent + composed);
ligatureCount += 1;
}
}
System.out.println(ligatureCount + " ligatures.");
}
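/**
* Build one decomposition table per equivalent: the i-th table (zero-based) maps
* each composed character to its i-th canonical equivalent, so a character with
* n equivalents appears in the first n tables.
*/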
public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
{
int maxDecompCount = data.getMaxEquivalents(script);
DecompTable[] decompTables = new DecompTable[maxDecompCount];
System.out.print("Building decompositon tables for " + UScript.getName(script) +
"... total decompositions: " + data.countRecords(script) +
", max: " + maxDecompCount + "...");
for (int i = 0; i < maxDecompCount; i += 1) {
DecompTable table = new DecompTable();
for (int r = 0; r < data.countRecords(script); r += 1) {
CanonicalCharacterData.Record record = data.getRecord(script, r);
if (record.countEquivalents() > i) {
table.add(record.getComposedCharacter(), record.getEquivalent(i));
}
}
decompTables[i] = table;
}
System.out.println(" Done.");
return decompTables;
}
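/**
* Build the composition (ligature) and decomposition (multiple substitution)
* lookups for a script and return their indices in the lookup list:
* lookups[0] is the composition lookup, lookups[1] the decomposition lookup.
*/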
public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
{
int[] lookups = new int[2];
DecompTable[] decompTables = buildDecompTables(data, script);
LigatureTree compTree = new LigatureTree();
buildLigatureTree(data, script, compTree);
System.out.println();
LigatureTreeWalker compWalker = new LigatureTreeWalker();
compTree.walk(compWalker);
Lookup compLookup, dcmpLookup;
//int compLookupIndex, dcmpLookupIndex;
compLookup = new Lookup(Lookup.GSST_Ligature, 0);
compLookup.addSubtable(compWalker);
dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
for (int i = 0; i < decompTables.length; i += 1) {
dcmpLookup.addSubtable(decompTables[i]);
}
lookups[0] = lookupList.addLookup(compLookup);
lookups[1] = lookupList.addLookup(dcmpLookup);
return lookups;
}
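/**
* Add each of the given lookup indices to the feature.
*/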
public static void addLookups(Feature feature, int[] lookups)
{
for (int i = 0; i < lookups.length; i += 1) {
feature.addLookup(lookups[i]);
}
}
/**
* Build the mark class table written to the generated GDEF data: every combining
* mark (canonical combining class != 0) is mapped to its combining class, except
* for the Hebrew and Arabic marks listed below, which get hand-assigned classes
* so that they sort into the desired order.
*
* Hebrew mark order taken from the SBL Hebrew Font manual.
* Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks.
*/
public static ClassTable buildCombiningClassTable()
{
UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
ClassTable exceptions = new ClassTable();
ClassTable combiningClasses = new ClassTable();
int markCount = markSet.size();
exceptions.addMapping(0x05C1, 10); // Point Shin Dot
exceptions.addMapping(0x05C2, 11); // Point Sin Dot
exceptions.addMapping(0x05BC, 21); // Point Dagesh or Mapiq
exceptions.addMapping(0x05BF, 23); // Point Rafe
exceptions.addMapping(0x05B9, 27); // Point Holam
exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
exceptions.addMapping(0x0591, 220); // Accent Etnahta
exceptions.addMapping(0x0596, 220); // Accent Tipeha
exceptions.addMapping(0x059B, 220); // Accent Tevir
exceptions.addMapping(0x05A3, 220); // Accent Munah
exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
exceptions.addMapping(0x05A5, 220); // Accent Merkha
exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
exceptions.addMapping(0x05A7, 220); // Accent Darga
exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
exceptions.addMapping(0x05B0, 220); // Point Sheva
exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
exceptions.addMapping(0x05B4, 220); // Point Hiriq
exceptions.addMapping(0x05B5, 220); // Point Tsere
exceptions.addMapping(0x05B6, 220); // Point Segol
exceptions.addMapping(0x05B7, 220); // Point Patah
exceptions.addMapping(0x05B8, 220); // Point Qamats
exceptions.addMapping(0x05BB, 220); // Point Qubuts
exceptions.addMapping(0x05BD, 220); // Point Meteg
exceptions.addMapping(0x059A, 222); // Accent Yetiv
exceptions.addMapping(0x05AD, 222); // Accent Dehi
exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
exceptions.addMapping(0x0593, 230); // Accent Shalshelet
exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
exceptions.addMapping(0x0597, 230); // Accent Revia
exceptions.addMapping(0x0598, 230); // Accent Zarqa
exceptions.addMapping(0x059F, 230); // Accent Qarney Para
exceptions.addMapping(0x059E, 230); // Accent Gershayim
exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
exceptions.addMapping(0x059C, 230); // Accent Geresh
exceptions.addMapping(0x0592, 230); // Accent Segolta
exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
exceptions.addMapping(0x05AC, 230); // Accent Iluy
exceptions.addMapping(0x05A8, 230); // Accent Qadma
exceptions.addMapping(0x05AB, 230); // Accent Ole
exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
exceptions.addMapping(0x05A1, 230); // Accent Pazer
//exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
exceptions.addMapping(0x05AE, 232); // Accent Zinor
exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
exceptions.addMapping(0x0599, 232); // Accent Pashta
exceptions.addMapping(0x0655, 27); // ARABIC HAMZA BELOW
exceptions.addMapping(0x0654, 27); // ARABIC HAMZA ABOVE
exceptions.addMapping(0x0651, 28); // ARABIC SHADDA
exceptions.addMapping(0x0656, 29); // ARABIC SUBSCRIPT ALEF
exceptions.addMapping(0x0670, 29); // ARABIC LETTER SUPERSCRIPT ALEF
exceptions.addMapping(0x064D, 30); // ARABIC KASRATAN
exceptions.addMapping(0x0650, 30); // ARABIC KASRA
exceptions.addMapping(0x0652, 31); // ARABIC SUKUN
exceptions.addMapping(0x06E1, 31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
exceptions.addMapping(0x064B, 31); // ARABIC FATHATAN
exceptions.addMapping(0x064C, 31); // ARABIC DAMMATAN
exceptions.addMapping(0x064E, 31); // ARABIC FATHA
exceptions.addMapping(0x064F, 31); // ARABIC DAMMA
exceptions.addMapping(0x0657, 31); // ARABIC INVERTED DAMMA
exceptions.addMapping(0x0658, 31); // ARABIC MARK NOON GHUNNA
exceptions.addMapping(0x0653, 32); // ARABIC MADDAH ABOVE
exceptions.snapshot();
for (int i = 0; i < markCount; i += 1) {
int mark = markSet.charAt(i);
int markClass = exceptions.getGlyphClassID(mark);
if (markClass == 0) {
markClass = UCharacter.getCombiningClass(mark);
}
combiningClasses.addMapping(mark, markClass);
}
combiningClasses.snapshot();
return combiningClasses;
}
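/**
* Top-level builder: collect every character with a canonical decomposition
* (excluding Hangul and full composition exclusions), build per-script 'ccmp'
* composition and decomposition lookups plus the Arabic shaping tables, and
* write the resulting GSUB and GDEF data to the named source file.
*/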
public static void buildDecompTables(String fileName)
{
// F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
//UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
ClassTable classTable = new ClassTable();
LookupList lookupList = new LookupList();
FeatureList featureList = new FeatureList();
ScriptList scriptList = new ScriptList();
// build common, inherited lookups...
// int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
// int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
// This is a bit lame, but it's the only way I can think of
// to make this work w/o knowing the values of COMMON and INHERITED...
if (script == UScript.COMMON || script == UScript.INHERITED ||
data.getMaxEquivalents(script) == 0) {
continue;
}
int[] lookups = buildLookups(data, lookupList, script);
Feature ccmp = new Feature("ccmp");
addLookups(ccmp, lookups);
// addLookups(ccmp, commonLookups);
// addLookups(ccmp, inheritedLookups);
featureList.addFeature(ccmp);
String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
scriptList.addFeature(scriptTag, "(default)", ccmp);
if (script == UScript.ARABIC) {
buildArabicTables(scriptList, featureList, lookupList, classTable);
}
}
featureList.finalizeFeatureList();
ClassTable markClassTable = buildCombiningClassTable();
GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
String[] includeFiles = {"LETypes.h", "CanonShaping.h"};
LigatureModuleWriter writer = new LigatureModuleWriter();
writer.openFile(fileName);
writer.writeHeader(null, includeFiles);
writer.writeTable(gsubWriter);
writer.writeTable(gdefWriter);
writer.writeTrailer();
writer.closeFile();
}
public static void main(String[] args)
{
if (args.length != 1) {
System.err.println("Usage: CanonGSUBBuilder <output file name>");
return;
}
buildDecompTables(args[0]);
}
}