#!/usr/bin/python2.4
# Copyright (c) 2009-2010 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: ucdcopy.py
# encoding: US-ASCII
# tab size: 8 (not used)
# indentation: 4
#
# created on: 2009aug04
# created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.
| |
| import os |
| import os.path |
| import re |
| import shutil |
| import sys |
| |
# Matches a data line (one that starts with a hex code point) that is followed
# by a "#" comment. Group 1 is the data without the comment and without the
# spaces directly before the "#".
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")
# Matches a single leading code point field including its terminating ";".
# Group 1 is the hex code point. Lines that start with a first..last range
# do not match.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
    """Writes one line for the code points first..last that share data."""
    if first == last:
        out_file.write("%04X%s\n" % (first, data))
    else:
        out_file.write("%04X..%04X%s\n" % (first, last, data))


def CopyAndStripWithOptionalMerge(s, t, do_merge):
    """Copies a file, strips comments behind data lines, optionally merges.

    Comments that follow data on the same line are removed; lines without
    leading data (for example pure comment lines) only lose their trailing
    whitespace.

    With do_merge, consecutive lines for single code points c, c+1, ... whose
    text from the ";" onwards is identical are written as one line, using
    the first..last range syntax when more than one code point is covered.
    Lines that do not start with a single code point field are written
    unchanged and end the current range.

    Args:
        s: Path of the file to read.
        t: Path of the file to write; an existing file is overwritten.
        do_merge: If true, merge adjacent code points with identical data.
    """
    in_file = open(s, "r")
    try:
        out_file = open(t, "w")
        try:
            first = -1  # First code point with first_data.
            last = -1  # Last code point with first_data; -1 if no open range.
            first_data = ""  # Common data for code points [first..last].
            for line in in_file:
                match = _strip_re.match(line)
                if match:
                    line = match.group(1)
                else:
                    line = line.rstrip()
                if not do_merge:
                    # Only strip, don't merge: just output the stripped line.
                    out_file.write(line + "\n")
                    continue
                match = _code_point_re.match(line)
                if match:
                    c = int(match.group(1), 16)
                    # Keep the ";" so that the data can be appended verbatim.
                    data = line[match.end() - 1:]
                else:
                    c = -1
                    data = ""
                if last >= 0 and (c != last + 1 or data != first_data):
                    # This line does not continue the open range: output it.
                    _WriteRange(out_file, first, last, first_data)
                    first = -1
                    last = -1
                    first_data = ""
                if c < 0:
                    # No code point on this line, output as is.
                    out_file.write(line + "\n")
                elif last < 0:
                    # Start a new possible range with this line.
                    first = c
                    last = c
                    first_data = data
                else:
                    # Here c == last + 1 and data == first_data because of
                    # the check above: extend the open range.
                    last = c
            if do_merge and last >= 0:
                # Output the last range in the file.
                _WriteRange(out_file, first, last, first_data)
        finally:
            out_file.close()
    finally:
        in_file.close()
| |
| |
def CopyAndStrip(s, t):
    """Copies file s to t, dropping comments that follow data on a line.

    Lines that are not data lines keep their comments.
    """
    CopyAndStripWithOptionalMerge(s, t, do_merge=False)
| |
| |
def CopyAndStripAndMerge(s, t):
    """Copies file s to t, strips comments, and merges adjacent lines.

    In addition to removing comments, lines for adjacent code point ranges
    that carry identical per-code point data are combined into a single
    line that uses the range syntax.
    """
    CopyAndStripWithOptionalMerge(s, t, do_merge=True)
| |
| |
# Maps each UCD file name that gets copied to its action: either a function
# taking (source path, destination path), or a tuple of such a function and
# the name of the destination subfolder to use instead of the default one.
_files = {}

# Simply copy these files.
_files.update(dict.fromkeys((
    "BidiMirroring.txt",
    "Blocks.txt",
    "CaseFolding.txt",
    "DerivedAge.txt",
    "DerivedBidiClass.txt",
    "DerivedJoiningGroup.txt",
    "DerivedJoiningType.txt",
    "DerivedNumericValues.txt",
    "NameAliases.txt",
    "NormalizationCorrections.txt",
    "PropertyAliases.txt",
    "PropertyValueAliases.txt",
    "ScriptExtensions.txt",
    "SpecialCasing.txt",
    "UnicodeData.txt",
), shutil.copy))

# Simply copy these files as well, but into the "testdata" folder.
_files.update(dict.fromkeys((
    "BidiTest.txt",
    "GraphemeBreakTest.txt",
    "LineBreakTest.txt",
    "SentenceBreakTest.txt",
    "WordBreakTest.txt",
), (shutil.copy, "testdata")))

# Copy these files and remove comments behind data lines but not in others.
_files.update(dict.fromkeys((
    "DerivedCoreProperties.txt",
    "DerivedNormalizationProps.txt",
    "GraphemeBreakProperty.txt",
    "NormalizationTest.txt",
    "PropList.txt",
    "Scripts.txt",
    "SentenceBreakProperty.txt",
    "WordBreakProperty.txt",
), CopyAndStrip))

# Also merge lines with adjacent code point ranges.
_files.update(dict.fromkeys((
    "EastAsianWidth.txt",
    "LineBreak.txt",
), CopyAndStripAndMerge))
| |
# Matches a file name that carries a version suffix before its extension,
# for example "UnicodeData-5.2.0.txt" or "LineBreak-5.2.0d3.txt".
# Group 1 is the name before the "-" and group 2 is the extension including
# its ".", so group(1) + group(2) is the file name without the version.
# Every version field may have more than one digit (e.g. "-10.0.0").
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                              r"(\.[a-z]+)$")
| |
def main():
    """Copies the files listed in _files from a source to a destination tree.

    Reads the source root folder from sys.argv[1] and the destination root
    folder from sys.argv[2]. Walks the whole source tree, removes a version
    suffix from each file name if it has one, and for each resulting name
    that is listed in _files runs the associated action to create the file
    in a subfolder of the destination root ("unidata" unless the _files
    entry names a different folder). Missing destination folders are
    created.

    Exits with status 1 if fewer than two folders are given, or if two
    source files map to the same name in _files.
    """
    if len(sys.argv) < 3:
        sys.stderr.write("usage: %s source_root dest_root\n" % sys.argv[0])
        sys.exit(1)
    source_root = sys.argv[1]
    dest_root = sys.argv[2]
    source_files = []
    for root, _, names in os.walk(source_root):
        for name in names:
            source_files.append(os.path.join(root, name))
    files_processed = set()
    for source_file in source_files:
        basename = os.path.basename(source_file)
        match = _file_version_re.match(basename)
        if match:
            # Drop the version suffix: keep only the name and the extension.
            basename = match.group(1) + match.group(2)
            # NOTE(review): the nesting of this print was reconstructed from
            # a paste without indentation -- confirm it belongs here.
            # The parenthesized form prints the same on Python 2 and 3.
            print(basename)
        if basename not in _files:
            continue
        if basename in files_processed:
            print("duplicate file basename %s!" % basename)
            sys.exit(1)
        files_processed.add(basename)
        action = _files[basename]
        if isinstance(action, tuple):
            # (function, destination subfolder)
            dest_folder = action[1]
            action = action[0]
        else:
            dest_folder = "unidata"
        dest_path = os.path.join(dest_root, dest_folder)
        if not os.path.exists(dest_path):
            os.makedirs(dest_path)
        dest_file = os.path.join(dest_path, basename)
        action(source_file, dest_file)
| |
| |
# Run the copy only when this file is executed as a script.
if __name__ == "__main__":
    main()