| #!/usr/bin/env python3 |
| # |
| # fontconfig/fc-case/fc-case.py |
| # |
| # Copyright © 2004 Keith Packard |
| # Copyright © 2019 Tim-Philipp Müller |
| # |
| # Permission to use, copy, modify, distribute, and sell this software and its |
| # documentation for any purpose is hereby granted without fee, provided that |
| # the above copyright notice appear in all copies and that both that |
| # copyright notice and this permission notice appear in supporting |
| # documentation, and that the name of the author(s) not be used in |
| # advertising or publicity pertaining to distribution of the software without |
| # specific, written prior permission. The authors make no |
| # representations about the suitability of this software for any purpose. It |
| # is provided "as is" without express or implied warranty. |
| # |
| # THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, |
| # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO |
| # EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR |
| # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, |
| # DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER |
| # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
| # PERFORMANCE OF THIS SOFTWARE. |
| |
| from enum import Enum |
| import argparse |
| import string |
| import sys |
| |
class CaseFoldClass(Enum):
    """Status of an entry read from the case folding data file.

    Each member corresponds to one status letter of the input file; the
    generator only emits table entries for COMMON and FULL mappings.
    """

    COMMON = 1  # status letter 'C'
    FULL = 2    # status letter 'F'
    SIMPLE = 3  # status letter 'S' (parsed, but no table entry is produced)
    TURKIC = 4  # status letter 'T' (parsed, but no table entry is produced)
| |
class CaseFoldMethod(Enum):
    """How a generated fold table entry maps upper case to lower case."""

    # A run of consecutive code points that all share one lower-upper offset.
    RANGE = 0
    # A run of entries two code points apart whose fold is the next code point.
    EVEN_ODD = 1
    # A multi-character expansion stored as UTF-8 bytes in the fold chars table.
    FULL = 2
| |
# Status letter found in the second field of a data line -> CaseFoldClass.
caseFoldClassMap = dict(
    C=CaseFoldClass.COMMON,
    F=CaseFoldClass.FULL,
    S=CaseFoldClass.SIMPLE,
    T=CaseFoldClass.TURKIC,
)

# Fold table entries collected while parsing; each entry is a dict with the
# keys 'upper', 'method', 'count' and 'offset'.
folds = list()
| |
def ucs4_to_utf8(ucs4):
    """Encode one UCS-4 value as a list of UTF-8 byte values.

    Sequences of up to six bytes are produced, so every value below
    0x80000000 is encodable.  Values of 0x80000000 and above yield an
    empty list.
    """
    # Anything below 0x80 is emitted unchanged as a single byte.
    if ucs4 < 0x80:
        return [ucs4]

    # (exclusive upper limit, lead byte marker, lead byte payload mask,
    #  number of continuation bytes)
    layouts = (
        (0x800, 0xC0, 0x1F, 1),
        (0x10000, 0xE0, 0x0F, 2),
        (0x200000, 0xF0, 0x07, 3),
        (0x4000000, 0xF8, 0x03, 4),
        (0x80000000, 0xFC, 0x01, 5),
    )

    for limit, marker, mask, trailing in layouts:
        if ucs4 < limit:
            shift = 6 * trailing
            encoded = [((ucs4 >> shift) & mask) | marker]
            # Each continuation byte carries the next lower six bits.
            while shift > 0:
                shift -= 6
                encoded.append(((ucs4 >> shift) & 0x3F) | 0x80)
            return encoded

    return []
| |
def utf8_size(ucs4):
    """Return how many bytes ucs4_to_utf8() produces for *ucs4*.

    The result is 0 when ucs4_to_utf8() returns an empty list.
    """
    encoded = ucs4_to_utf8(ucs4)
    return len(encoded)
| |
# Text written into the generated table for each fold method.  The trailing
# comma is part of the value so the column can be padded as a single field.
case_fold_method_name_map = {
    # run of code points sharing one offset
    CaseFoldMethod.RANGE: 'FC_CASE_FOLD_RANGE,',
    # alternating upper/lower pairs
    CaseFoldMethod.EVEN_ODD: 'FC_CASE_FOLD_EVEN_ODD,',
    # multi-character expansion
    CaseFoldMethod.FULL: 'FC_CASE_FOLD_FULL,',
}
| |
def _to_int16(value):
    """Wrap *value* once into the signed 16-bit range [-32768, 32767]."""
    if value < -32768:
        return value + 65536
    if value > 32767:
        return value - 65536
    return value


def main():
    """Generate the C case folding tables from a CaseFolding.txt style file.

    Command line:
        case_folding_file   data file to parse (read as UTF-8)
        --template FILE     template to expand (default: standard input)
        --output FILE       where to write the result (default: standard output)

    The template is copied to the output up to the first line consisting of
    '@@@'.  The generated #defines and tables are written in place of that
    line, followed by the remainder of the template.

    Exits with status 1 when a data line has fewer than three '; ' separated
    fields or when the data file contains no data lines at all.  A status
    letter missing from caseFoldClassMap raises KeyError.
    """
    # Only this entry point needs contextlib, so it is imported locally.
    import contextlib

    parser = argparse.ArgumentParser()
    parser.add_argument('case_folding_file')
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)
    args = parser.parse_args()

    minFoldChar = None  # first code point seen in the data file
    maxFoldChar = None  # last code point seen in the data file
    fold = None         # most recently created entry of `folds`

    foldChars = []      # UTF-8 bytes of all multi-character expansions
    maxFoldChars = 0    # longest single expansion, in bytes
    maxExpand = 0       # largest growth in UTF-8 bytes caused by one fold

    # Read the standard Unicode CaseFolding.txt file
    with open(args.case_folding_file, 'r', encoding='utf-8') as casefile:
        # Count from 1 so the diagnostic matches what an editor shows.
        for lineno, line in enumerate(casefile, 1):
            # Data lines start with a hex code point; skip everything else.
            if not line or line[0] not in string.hexdigits:
                continue

            tokens = line.split('; ')
            if len(tokens) < 3:
                print('Not enough tokens in line {}'.format(lineno), file=sys.stderr)
                sys.exit(1)

            upper = int(tokens[0], 16)
            cfclass = caseFoldClassMap[tokens[1]]
            lower = [int(s, 16) for s in tokens[2].split()]

            # Every data line counts towards the min/max, whatever its class.
            if minFoldChar is None:
                minFoldChar = upper
            maxFoldChar = upper

            if cfclass not in (CaseFoldClass.COMMON, CaseFoldClass.FULL):
                continue

            if len(lower) == 1:
                delta = lower[0] - upper

                # Can this mapping be merged into the previous table entry?
                if fold and fold['method'] == CaseFoldMethod.RANGE:
                    foldExtends = (delta == fold['offset']
                                   and upper == fold['upper'] + fold['count'])
                elif fold and fold['method'] == CaseFoldMethod.EVEN_ODD:
                    foldExtends = (delta == 1
                                   and upper == fold['upper'] + fold['count'] + 1)
                else:
                    foldExtends = False

                if foldExtends:
                    # `fold` is the dict already stored in `folds`, so the
                    # table entry is updated in place.
                    fold['count'] = upper - fold['upper'] + 1
                else:
                    fold = {
                        'upper': upper,
                        'offset': delta,
                        'method': (CaseFoldMethod.EVEN_ODD if delta == 1
                                   else CaseFoldMethod.RANGE),
                        'count': 1,
                    }
                    folds.append(fold)

                expand = utf8_size(lower[0]) - utf8_size(upper)
            else:
                # Multi-character expansion: 'offset' indexes into foldChars
                # and 'count' is the length of the expansion in bytes.
                fold = {
                    'upper': upper,
                    'method': CaseFoldMethod.FULL,
                    'offset': len(foldChars),
                }
                for c in lower:
                    foldChars.extend(ucs4_to_utf8(c))

                fold['count'] = len(foldChars) - fold['offset']
                folds.append(fold)

                if fold['count'] > maxFoldChars:
                    maxFoldChars = fold['count']

                expand = fold['count'] - utf8_size(upper)

            # Track the worst-case growth for both kinds of fold.
            if expand > maxExpand:
                maxExpand = expand

    if minFoldChar is None:
        print('No case folding entries found in {}'.format(args.case_folding_file),
              file=sys.stderr)
        sys.exit(1)

    with contextlib.ExitStack() as stack:
        # Open the template first so a missing template does not leave a
        # truncated output file behind.
        if args.template_file:
            tmpl_file = stack.enter_context(
                open(args.template_file, 'r', encoding='utf-8'))
        else:
            tmpl_file = sys.stdin

        if args.output_file:
            out = stack.enter_context(
                open(args.output_file, 'w', encoding='utf-8'))
        else:
            out = sys.stdout

        # Scan the input until the marker is found
        # FIXME: this is a bit silly really, might just as well hardcode
        # the license header in the script and drop the template
        for line in tmpl_file:
            if line.strip() == '@@@':
                break
            print(line, end='', file=out)

        # Dump these tables
        print('#define FC_NUM_CASE_FOLD\t{}'.format(len(folds)), file=out)
        print('#define FC_NUM_CASE_FOLD_CHARS\t{}'.format(len(foldChars)), file=out)
        print('#define FC_MAX_CASE_FOLD_CHARS\t{}'.format(maxFoldChars), file=out)
        print('#define FC_MAX_CASE_FOLD_EXPAND\t{}'.format(maxExpand), file=out)
        print('#define FC_MIN_FOLD_CHAR\t0x{:08x}'.format(minFoldChar), file=out)
        print('#define FC_MAX_FOLD_CHAR\t0x{:08x}'.format(maxFoldChar), file=out)
        print('', file=out)

        # Dump out ranges
        print('static const FcCaseFold fcCaseFold[FC_NUM_CASE_FOLD] = {', file=out)
        for f in folds:
            # The offset is wrapped modulo 65536 so that it fits a signed
            # 16-bit value (presumably the width of the C struct field --
            # confirm against the FcCaseFold declaration).
            short_offset = _to_int16(f['offset'])
            print(' {} 0x{:08x}, {:22s} 0x{:04x}, {:6d} {},'.format('{',
                  f['upper'], case_fold_method_name_map[f['method']],
                  f['count'], short_offset, '}'), file=out)
        print('};\n', file=out)

        # Dump out "other" values, sixteen bytes per line
        print('static const FcChar8\tfcCaseFoldChars[FC_NUM_CASE_FOLD_CHARS] = {',
              file=out)
        last = len(foldChars) - 1
        for n, c in enumerate(foldChars):
            if n == last:
                end = ''
            elif n % 16 == 15:
                end = ',\n'
            else:
                end = ','
            print('0x{:02x}'.format(c), end=end, file=out)
        print('\n};', file=out)

        # And flush out the rest of the input file
        for line in tmpl_file:
            print(line, end='', file=out)

        out.flush()


if __name__ == '__main__':
    main()