| #!/usr/bin/env python3 |
| # |
| # fontconfig/fc-lang/fc-lang.py |
| # |
| # Copyright © 2001-2002 Keith Packard |
| # Copyright © 2019 Tim-Philipp Müller |
| # |
| # Permission to use, copy, modify, distribute, and sell this software and its |
| # documentation for any purpose is hereby granted without fee, provided that |
| # the above copyright notice appear in all copies and that both that |
| # copyright notice and this permission notice appear in supporting |
| # documentation, and that the name of the author(s) not be used in |
| # advertising or publicity pertaining to distribution of the software without |
| # specific, written prior permission. The authors make no |
| # representations about the suitability of this software for any purpose. It |
| # is provided "as is" without express or implied warranty. |
| # |
| # THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, |
| # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO |
| # EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR |
| # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, |
| # DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER |
| # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
| # PERFORMANCE OF THIS SOFTWARE. |
| |
| # fc-lang |
| # |
| # Read a set of language orthographies and build C declarations for |
| # charsets which can then be used to identify which languages are |
| # supported by a given font. |
| # |
| # TODO: this code is not very pythonic, a lot of it is a 1:1 translation |
| # of the C code and we could probably simplify it a bit |
| import argparse |
| import string |
| import sys |
| import os |
| |
# We just store the leaves in a dict; they can be ordered later if needed.
class CharSet:
    """Sparse set of code points kept as 256-bit leaves.

    Each leaf covers the 256 code points that share the same ``ucs4 >> 8``
    value and is a list of eight 32-bit words (256 / 32 = 8). Only leaves
    that have been touched are present in ``leaves``.
    """

    def __init__(self):
        # leaf number (ucs4 >> 8) -> leaf data (list of 8 uint32 words)
        self.leaves = {}

    def add_char(self, ucs4):
        """Set the bit for code point ``ucs4``, creating its leaf on demand.

        Asserts that ``ucs4`` is below 0x01000000.
        """
        assert ucs4 < 0x01000000
        low_byte = ucs4 & 0xff
        bitmap = self.leaves.setdefault(ucs4 >> 8, [0] * 8)  # 256/32 = 8
        # Bits 5-7 of the low byte select the word, bits 0-4 the bit in it.
        bitmap[low_byte >> 5] |= 1 << (low_byte & 0x1f)

    def del_char(self, ucs4):
        """Clear the bit for code point ``ucs4`` if its leaf exists.

        Asserts that ``ucs4`` is below 0x01000000. A leaf that becomes
        all-zero is left in place.
        """
        assert ucs4 < 0x01000000
        bitmap = self.leaves.get(ucs4 >> 8)
        if bitmap is None:
            return
        low_byte = ucs4 & 0xff
        # We don't bother removing the leaf if it ends up empty.
        bitmap[low_byte >> 5] &= ~(1 << (low_byte & 0x1f))

    def equals(self, other_cs):
        """Return True if both charsets hold the same leaves with equal data.

        The comparison is per leaf, so a leaf that is present but all-zero
        is not the same as a leaf that is absent.
        """
        own_numbers = sorted(self.leaves)
        if own_numbers != sorted(other_cs.leaves):
            return False
        return all(
            leaves_equal(self.leaves[num], other_cs.leaves[num])
            for num in own_numbers
        )
| |
# Convert a file name into a name suitable for C declarations
def get_name(file_name):
    """Return the part of ``file_name`` that precedes its first '.'.

    A name without any '.' is returned unchanged.
    """
    stem, _, _ = file_name.partition('.')
    return stem
| |
# Convert a C name into a language name
def get_lang(c_name):
    """Return ``c_name`` lower-cased, with '_' turned into '-' and spaces removed."""
    # One translation pass: map '_' to '-' and delete ' '.
    table = str.maketrans('_', '-', ' ')
    return c_name.translate(table).lower()
| |
def read_orth_file(file_name):
    """Read an orthography file and return its significant lines.

    Lines starting with ``include `` are replaced by the lines of the named
    file, read recursively; the include path is opened exactly as written,
    so it is resolved against the current working directory. For every
    other line the text after the first '#' is dropped, only the first
    tab-separated field is kept, and lines that end up empty are skipped.

    Returns a list of ``(file_name, line_number, text)`` tuples in reading
    order. ``line_number`` is 1-based within the file the line came from,
    so diagnostics built from it match what an editor shows.
    """
    lines = []
    with open(file_name, 'r', encoding='utf-8') as orth_file:
        # Count from 1: a 0-based index would make error messages point at
        # the line before the offending one.
        for num, line in enumerate(orth_file, 1):
            if line.startswith('include '):
                include_fn = line[len('include '):].strip()
                lines.extend(read_orth_file(include_fn))
                continue

            # remove comments and strip whitespaces
            line = line.split('#')[0].strip()
            line = line.split('\t')[0].strip()
            # skip empty lines
            if line:
                lines.append((file_name, num, line))

    return lines
| |
def leaves_equal(leaf1, leaf2):
    """Return True if the two leaves agree word by word.

    The words are paired with ``zip``, so if the leaves differ in length
    only the common prefix is compared.
    """
    return all(word1 == word2 for word1, word2 in zip(leaf1, leaf2))
| |
# Build a single charset from a source file
def parse_orth_file(file_name, lines):
    """Build a CharSet from the pre-read lines of an orthography file.

    Each entry of ``lines`` is a ``(file_name, line_number, text)`` tuple.
    ``text`` is one of:

    * a single hex value,
    * a range written as two hex values separated by '-' or '..',
    * either of the above prefixed with '-', which removes the code
      points from the set instead of adding them.

    ``file_name`` is accepted for symmetry with the caller but is not
    used; diagnostics use the file name stored in each tuple.

    Returns the resulting CharSet. Raises ValueError (from ``int``) if a
    value is not valid hexadecimal. A line with more than two values is
    reported on stderr and its first two values are still applied.
    """
    charset = CharSet()
    for fn, num, line in lines:
        delete_char = line.startswith('-')
        if delete_char:
            line = line[1:]

        if '-' in line:
            parts = line.split('-')
        elif '..' in line:
            parts = line.split('..')
        else:
            parts = [line]

        start = int(parts.pop(0), 16)
        end = int(parts.pop(0), 16) if parts else start
        if parts:
            # Report on stderr: stdout may have been redirected to the
            # generated output file, where the message would be lost in
            # (and corrupt) the generated code.
            print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num),
                  file=sys.stderr)

        update = charset.del_char if delete_char else charset.add_char
        for ucs4 in range(start, end + 1):
            update(ucs4)

    assert charset.equals(charset) # sanity check for the equals function

    return charset
| |
# Script entry point.
#
# Reads every .orth file named on the command line, builds one CharSet per
# file, and prints the C tables describing them. The template (from
# --template or stdin) is copied to the output up to a line consisting of
# '@@@', the generated tables are printed in its place, and the rest of the
# template follows. Output goes to --output if given, otherwise to stdout.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('orth_files', nargs='+', help='List of .orth files')
    parser.add_argument('--directory', dest='directory', default=None)
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    # Parallel lists, all indexed by position in the sorted file list
    sets = []
    names = []
    langs = []
    country = []

    total_leaves = 0

    # language family -> command-line indices of its country variants
    LangCountrySets = {}

    # Open output file; everything below is emitted with print()
    saved_stdout = sys.stdout
    if args.output_file:
        sys.stdout = open(args.output_file, 'w', encoding='utf-8')

    # Read the template file
    if args.template_file:
        tmpl_file = open(args.template_file, 'r', encoding='utf-8')
    else:
        tmpl_file = sys.stdin

    # Change into source dir if specified (after opening other files)
    if args.directory:
        os.chdir(args.directory)

    # file name -> position on the command line
    orth_entries = {}
    for i, fn in enumerate(args.orth_files):
        orth_entries[fn] = i

    for fn in sorted(orth_entries.keys()):
        lines = read_orth_file(fn)
        charset = parse_orth_file(fn, lines)

        sets.append(charset)

        name = get_name(fn)
        names.append(name)

        lang = get_lang(name)
        langs.append(lang)
        if '-' in lang:
            country.append(orth_entries[fn]) # maps to original index
            language_family = lang.split('-')[0]
            if language_family not in LangCountrySets:
                LangCountrySets[language_family] = []
            LangCountrySets[language_family] += [orth_entries[fn]]

        total_leaves += len(charset.leaves)

    # Find unique leaves
    leaves = []
    for s in sets:
        for leaf_num in sorted(s.leaves.keys()):
            leaf = s.leaves[leaf_num]
            is_unique = True
            for existing_leaf in leaves:
                if leaves_equal(leaf, existing_leaf):
                    is_unique = False
                    break
            if is_unique:
                leaves.append(leaf)

    # Find duplicate charsets: duplicate[i] is the index of the first
    # earlier set equal to set i, or None if there is none.
    duplicate = []
    for i, s in enumerate(sets):
        dup_num = None
        for j in range(i):
            if sets[j].equals(s):
                dup_num = j
                break

        duplicate.append(dup_num)

    # Assign each non-duplicate set its starting offset into the
    # leaf_offsets / numbers tables. The test must be 'is not None':
    # index 0 is a valid duplicate target but is falsy.
    tn = 0
    off = {}
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue
        off[i] = tn
        tn += len(s.leaves)

    # Scan the input until the marker is found
    # FIXME: this is a bit silly really, might just as well hardcode
    #        the license header in the script and drop the template
    for line in tmpl_file:
        if line.strip() == '@@@':
            break
        print(line, end='')

    print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))

    print('#define LEAF0 ({} * sizeof (FcLangCharSet))'.format(len(sets)))
    print('#define OFF0 (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
    print('#define NUM0 (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
    print('#define SET(n) (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
    print('#define OFF(s,o) (OFF0 + o * sizeof (uintptr_t) - SET(s))')
    print('#define NUM(s,n) (NUM0 + n * sizeof (FcChar16) - SET(s))')
    print('#define LEAF(o,l) (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
    print('#define fcLangCharSets (fcLangData.langCharSets)')
    print('#define fcLangCharSetIndices (fcLangData.langIndices)')
    print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')

    assert len(sets) < 256 # FIXME: need to change index type to 16-bit below then

    print('''
static const struct {{
FcLangCharSet langCharSets[{}];
FcCharLeaf leaves[{}];
uintptr_t leaf_offsets[{}];
FcChar16 numbers[{}];
{} langIndices[{}];
{} langIndicesInv[{}];
}} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
                             'FcChar8 ', len(sets), 'FcChar8 ', len(sets)))

    # Dump sets; a duplicate set reuses the offsets of the set it duplicates
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            j = duplicate[i]
        else:
            j = i
        print(' {{ "{}", {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
            langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))

    print('},')

    # Dump leaves
    print('{')
    for l, leaf in enumerate(leaves):
        print(' {{ {{ /* {} */'.format(l), end='')
        for i in range(0, 8): # 256/32 = 8
            if i % 4 == 0:
                print('\n ', end='')
            print(' 0x{:08x},'.format(leaf[i]), end='')
        print('\n } },')
    print('},')

    # Dump leaf offsets (four per output line)
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print(' /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            leaf = s.leaves[leaf_num]
            if n % 4 == 0:
                print(' ', end='')
            found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf, leaf)]
            assert found, "Couldn't find leaf in unique leaves list!"
            assert len(found) == 1
            print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
            if n % 4 == 3:
                print('')
        if len(s.leaves) % 4 != 0:
            print('')

    print('},')

    # Dump leaf numbers (eight per output line)
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print(' /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            if n % 8 == 0:
                print(' ', end='')
            print(' 0x{:04x},'.format(leaf_num), end='')
            if n % 8 == 7:
                print('')
        if len(s.leaves) % 8 != 0:
            print('')

    print('},')

    # langIndices: sorted position -> command-line position.
    # orth_entries is keyed by the file names as given on the command
    # line, so this lookup assumes each was given as '<name>.orth'.
    print('{')
    for i, s in enumerate(sets):
        fn = '{}.orth'.format(names[i])
        print(' {}, /* {} */'.format(orth_entries[fn], names[i]))
    print('},')

    # langIndicesInv: command-line position -> sorted position
    print('{')
    for i, k in enumerate(orth_entries.keys()):
        name = get_name(k)
        idx = names.index(name)
        print(' {}, /* {} */'.format(idx, name))
    print('}')

    print('};\n')

    print('#define NUM_LANG_CHAR_SET {}'.format(len(sets)))
    num_lang_set_map = (len(sets) + 31) // 32
    print('#define NUM_LANG_SET_MAP {}'.format(num_lang_set_map))

    # Dump indices with country codes
    assert len(country) > 0
    assert len(LangCountrySets) > 0
    print('')
    print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
    for k in sorted(LangCountrySets.keys()):
        langset_map = [0] * num_lang_set_map # initialise all zeros
        for entries_id in LangCountrySets[k]:
            langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
        print(' {', end='')
        for v in langset_map:
            print(' 0x{:08x},'.format(v), end='')
        print(' }}, /* {} */'.format(k))

    print('};\n')
    print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))

    # Find ranges for each letter for faster searching
    # Dump sets start/finish for the fastpath
    print('static const FcLangCharSetRange fcLangCharSetRanges[] = {\n')
    for c in string.ascii_lowercase: # a-z
        start = 9999
        stop = -1
        for i, s in enumerate(sets):
            if names[i].startswith(c):
                start = min(start, i)
                stop = max(stop, i)
        print(' {{ {}, {} }}, /* {} */'.format(start, stop, c))
    print('};\n')

    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')

    sys.stdout.flush()

    # Release the files this script opened itself (never stdin/stdout)
    if args.template_file:
        tmpl_file.close()
    if args.output_file:
        sys.stdout.close()
        sys.stdout = saved_stdout