unicode/c/genprops/misc/ucdcopy.py - external/github.com/unicode-org/icu - Git at Google

 #!/usr/bin/python2.4
 # Copyright (c) 2009-2010 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #   file name:  ucdcopy.py
 #   encoding:   US-ASCII
 #   tab size:   8 (not used)
 #   indentation:4
 #
 #   created on: 2009aug04
 #   created by: Markus W. Scherer
 #
 # Copy Unicode Character Database (ucd) files from a tree
 # of files downloaded from ftp://www.unicode.org/Public/5.2.0/
 # to a folder like ICU's source/data/unidata/
 # and modify some of the files to make them more compact.
 #
 # Invoke with two command-line parameters, for the source
 # and destination folders.

 import os
 import os.path
 import re
 import shutil
 import sys

 # Matches a line that starts with a hex code point and carries a trailing
 # comment. Group 1 is the data in front of the comment, without the spaces
 # that precede the '#'.
 _strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")
 # Matches a single code point followed by ';' at the start of a line.
 # A line that starts with a range (0041..005A;) does not match.
 _code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


 def _WriteRange(out_file, first, last, data):
   """Writes one line for the code points [first..last] that share data."""
   if first == last:
     out_file.write("%04X%s\n" % (first, data))
   else:
     out_file.write("%04X..%04X%s\n" % (first, last, data))


 def _CopyLines(in_file, out_file, do_merge):
   """Copies stripped lines from in_file to out_file, optionally merging."""
   first = -1  # First code point with first_data.
   last = -1  # Last code point with first_data; negative: no pending range.
   first_data = ""  # Common data for code points [first..last].
   for line in in_file:
     match = _strip_re.match(line)
     if match:
       line = match.group(1)
     else:
       line = line.rstrip()
     if not do_merge:
       # Only strip, don't merge: just output the stripped line.
       out_file.write(line + "\n")
       continue
     match = _code_point_re.match(line)
     if match:
       c = int(match.group(1), 16)
       # Keep everything from the ';' on as the per-code point data.
       data = line[match.end() - 1:]
     else:
       c = -1
       data = ""
     if last >= 0 and (c != (last + 1) or data != first_data):
       # This line does not continue the pending range: output the range.
       _WriteRange(out_file, first, last, first_data)
       last = -1
     if c < 0:
       # No single code point on this line, output as is.
       out_file.write(line + "\n")
     elif last < 0:
       # Set as the first line in a possible range.
       first = c
       last = c
       first_data = data
     else:
       # Here c == (last + 1) and data == first_data: extend the range.
       last = c
   if last >= 0:
     # Output the last range in the file.
     _WriteRange(out_file, first, last, first_data)


 def CopyAndStripWithOptionalMerge(s, t, do_merge):
   """Copies the file s to t and removes comments behind data lines.

   A line that starts with a hex code point and has a trailing '#' comment
   is written without the comment. Every other line is written with only
   its trailing whitespace removed.

   Args:
     s: Path of the source file.
     t: Path of the destination file; it is created or overwritten.
     do_merge: If true, consecutive lines for adjacent single code points
         with identical data are written as one first..last range line.
   """
   in_file = open(s, "r")
   try:
     out_file = open(t, "w")
     try:
       _CopyLines(in_file, out_file, do_merge)
     finally:
       # close() also flushes; run it even if copying raised.
       out_file.close()
   finally:
     in_file.close()


 def CopyAndStrip(s, t):
   """Copies s to t, dropping the comments that trail data lines.

   Lines are only stripped, never merged.
   """
   CopyAndStripWithOptionalMerge(s, t, do_merge=False)


 def CopyAndStripAndMerge(s, t):
   """Copies s to t, stripping comments and compacting code point runs.

   Besides removing the comments behind data lines, consecutive lines for
   adjacent code points that carry identical data are written as a single
   line in range syntax.
   """
   CopyAndStripWithOptionalMerge(s, t, do_merge=True)


 # Maps an unversioned UCD file name to the action that copies it.
 # A value is either a function called with (source path, destination path),
 # in which case the file goes into the "unidata" folder, or a tuple of
 # (function, destination folder name).
 _files = {
   # Test files: simply copy them, but into the "testdata" folder.
   "BidiTest.txt": (shutil.copy, "testdata"),
   "GraphemeBreakTest.txt": (shutil.copy, "testdata"),
   "LineBreakTest.txt": (shutil.copy, "testdata"),
   "SentenceBreakTest.txt": (shutil.copy, "testdata"),
   "WordBreakTest.txt": (shutil.copy, "testdata"),

   # Data files that are simply copied.
   "BidiMirroring.txt": shutil.copy,
   "Blocks.txt": shutil.copy,
   "CaseFolding.txt": shutil.copy,
   "DerivedAge.txt": shutil.copy,
   "DerivedBidiClass.txt": shutil.copy,
   "DerivedJoiningGroup.txt": shutil.copy,
   "DerivedJoiningType.txt": shutil.copy,
   "DerivedNumericValues.txt": shutil.copy,
   "NameAliases.txt": shutil.copy,
   "NormalizationCorrections.txt": shutil.copy,
   "PropertyAliases.txt": shutil.copy,
   "PropertyValueAliases.txt": shutil.copy,
   "ScriptExtensions.txt": shutil.copy,
   "SpecialCasing.txt": shutil.copy,
   "UnicodeData.txt": shutil.copy,

   # Data files that lose the comments behind data lines; comment-only
   # lines are kept.
   "DerivedCoreProperties.txt": CopyAndStrip,
   "DerivedNormalizationProps.txt": CopyAndStrip,
   "GraphemeBreakProperty.txt": CopyAndStrip,
   "NormalizationTest.txt": CopyAndStrip,
   "PropList.txt": CopyAndStrip,
   "Scripts.txt": CopyAndStrip,
   "SentenceBreakProperty.txt": CopyAndStrip,
   "WordBreakProperty.txt": CopyAndStrip,

   # Data files that are stripped and also get lines with adjacent code
   # points merged into ranges.
   "EastAsianWidth.txt": CopyAndStripAndMerge,
   "LineBreak.txt": CopyAndStripAndMerge,
 }

 # Matches a file name with a version suffix, such as LineBreak-5.2.0d3.txt.
 # Group 1 is the name in front of the '-' and group 2 is the extension, so
 # group 1 + group 2 is the unversioned name (LineBreak.txt).
 # Each version field may have several digits, so -10.0.0 matches as well.
 _file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
                               r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                               r"(\.[a-z]+)$")

 def main():
   """Copies the known UCD files from a source tree to a destination tree.

   Reads the source root from sys.argv[1] and the destination root from
   sys.argv[2]. Every file below the source root whose name, after removal
   of a version suffix, is a key of _files is passed to the registered
   action. Each unversioned name of a versioned file is printed.

   Exits with status 1 if an argument is missing or if two source files
   map to the same unversioned name.
   """
   if len(sys.argv) < 3:
     sys.stderr.write(
         "usage: %s source_folder destination_folder\n" % sys.argv[0])
     sys.exit(1)
   source_root = sys.argv[1]
   dest_root = sys.argv[2]
   # Collect all paths first so that the walk is finished before any file
   # is written.
   source_files = []
   for root, _, names in os.walk(source_root):
     for name in names:
       source_files.append(os.path.join(root, name))
   files_processed = set()
   for source_file in source_files:
     basename = os.path.basename(source_file)
     match = _file_version_re.match(basename)
     if match:
       # Drop the version suffix: Name-5.2.0d3.txt becomes Name.txt.
       basename = match.group(1) + match.group(2)
       print(basename)
     if basename not in _files:
       continue
     if basename in files_processed:
       print("duplicate file basename %s!" % basename)
       sys.exit(1)
     files_processed.add(basename)
     action = _files[basename]
     if isinstance(action, tuple):
       # (function, folder): the file goes into a non-default folder.
       action, dest_folder = action
     else:
       dest_folder = "unidata"
     dest_path = os.path.join(dest_root, dest_folder)
     if not os.path.exists(dest_path):
       os.makedirs(dest_path)
     action(source_file, os.path.join(dest_path, basename))


 # Run main() only when this file is executed as a script.
 if __name__ == "__main__":
   main()
	#!/usr/bin/python2.4
	# Copyright (c) 2009-2010 International Business Machines
	# Corporation and others. All Rights Reserved.
	#
	# file name: ucdcopy.py
	# encoding: US-ASCII
	# tab size: 8 (not used)
	# indentation:4
	#
	# created on: 2009aug04
	# created by: Markus W. Scherer
	#
	# Copy Unicode Character Database (ucd) files from a tree
	# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
	# to a folder like ICU's source/data/unidata/
	# and modify some of the files to make them more compact.
	#
	# Invoke with two command-line parameters, for the source
	# and destination folders.

	import os
	import os.path
	import re
	import shutil
	import sys

	# Matches a line that starts with a hex code point and carries a trailing
	# comment. Group 1 is the data in front of the comment, without the spaces
	# that precede the '#'.
	_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")
	# Matches a single code point followed by ';' at the start of a line.
	# A line that starts with a range (0041..005A;) does not match.
	_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


	def _WriteRange(out_file, first, last, data):
	  """Writes one line for the code points [first..last] that share data."""
	  if first == last:
	    out_file.write("%04X%s\n" % (first, data))
	  else:
	    out_file.write("%04X..%04X%s\n" % (first, last, data))


	def _CopyLines(in_file, out_file, do_merge):
	  """Copies stripped lines from in_file to out_file, optionally merging."""
	  first = -1  # First code point with first_data.
	  last = -1  # Last code point with first_data; negative: no pending range.
	  first_data = ""  # Common data for code points [first..last].
	  for line in in_file:
	    match = _strip_re.match(line)
	    if match:
	      line = match.group(1)
	    else:
	      line = line.rstrip()
	    if not do_merge:
	      # Only strip, don't merge: just output the stripped line.
	      out_file.write(line + "\n")
	      continue
	    match = _code_point_re.match(line)
	    if match:
	      c = int(match.group(1), 16)
	      # Keep everything from the ';' on as the per-code point data.
	      data = line[match.end() - 1:]
	    else:
	      c = -1
	      data = ""
	    if last >= 0 and (c != (last + 1) or data != first_data):
	      # This line does not continue the pending range: output the range.
	      _WriteRange(out_file, first, last, first_data)
	      last = -1
	    if c < 0:
	      # No single code point on this line, output as is.
	      out_file.write(line + "\n")
	    elif last < 0:
	      # Set as the first line in a possible range.
	      first = c
	      last = c
	      first_data = data
	    else:
	      # Here c == (last + 1) and data == first_data: extend the range.
	      last = c
	  if last >= 0:
	    # Output the last range in the file.
	    _WriteRange(out_file, first, last, first_data)


	def CopyAndStripWithOptionalMerge(s, t, do_merge):
	  """Copies the file s to t and removes comments behind data lines.

	  A line that starts with a hex code point and has a trailing '#' comment
	  is written without the comment. Every other line is written with only
	  its trailing whitespace removed.

	  Args:
	    s: Path of the source file.
	    t: Path of the destination file; it is created or overwritten.
	    do_merge: If true, consecutive lines for adjacent single code points
	        with identical data are written as one first..last range line.
	  """
	  in_file = open(s, "r")
	  try:
	    out_file = open(t, "w")
	    try:
	      _CopyLines(in_file, out_file, do_merge)
	    finally:
	      # close() also flushes; run it even if copying raised.
	      out_file.close()
	  finally:
	    in_file.close()


	def CopyAndStrip(s, t):
	  """Copies a file and removes comments behind data lines but not in others.

	  Lines are only stripped, never merged.
	  """
	  CopyAndStripWithOptionalMerge(s, t, False)


	def CopyAndStripAndMerge(s, t):
	  """Copies and strips a file and merges lines.

	  Copies a file, removes the comments behind data lines, and merges
	  lines for adjacent code points with identical per-code point data
	  into one line with range syntax.
	  """
	  CopyAndStripWithOptionalMerge(s, t, True)


	# Maps an unversioned UCD file name to the action that copies it.
	# A value is either a function called with (source path, destination path),
	# in which case the file goes into the "unidata" folder, or a tuple of
	# (function, destination folder name).
	_files = {
	  # Test files: simply copy them, but into the "testdata" folder.
	  "BidiTest.txt": (shutil.copy, "testdata"),
	  "GraphemeBreakTest.txt": (shutil.copy, "testdata"),
	  "LineBreakTest.txt": (shutil.copy, "testdata"),
	  "SentenceBreakTest.txt": (shutil.copy, "testdata"),
	  "WordBreakTest.txt": (shutil.copy, "testdata"),

	  # Data files that are simply copied.
	  "BidiMirroring.txt": shutil.copy,
	  "Blocks.txt": shutil.copy,
	  "CaseFolding.txt": shutil.copy,
	  "DerivedAge.txt": shutil.copy,
	  "DerivedBidiClass.txt": shutil.copy,
	  "DerivedJoiningGroup.txt": shutil.copy,
	  "DerivedJoiningType.txt": shutil.copy,
	  "DerivedNumericValues.txt": shutil.copy,
	  "NameAliases.txt": shutil.copy,
	  "NormalizationCorrections.txt": shutil.copy,
	  "PropertyAliases.txt": shutil.copy,
	  "PropertyValueAliases.txt": shutil.copy,
	  "ScriptExtensions.txt": shutil.copy,
	  "SpecialCasing.txt": shutil.copy,
	  "UnicodeData.txt": shutil.copy,

	  # Data files that lose the comments behind data lines; comment-only
	  # lines are kept.
	  "DerivedCoreProperties.txt": CopyAndStrip,
	  "DerivedNormalizationProps.txt": CopyAndStrip,
	  "GraphemeBreakProperty.txt": CopyAndStrip,
	  "NormalizationTest.txt": CopyAndStrip,
	  "PropList.txt": CopyAndStrip,
	  "Scripts.txt": CopyAndStrip,
	  "SentenceBreakProperty.txt": CopyAndStrip,
	  "WordBreakProperty.txt": CopyAndStrip,

	  # Data files that are stripped and also get lines with adjacent code
	  # points merged into ranges.
	  "EastAsianWidth.txt": CopyAndStripAndMerge,
	  "LineBreak.txt": CopyAndStripAndMerge,
	}

	# Matches a file name with a version suffix, such as LineBreak-5.2.0d3.txt.
	# Group 1 is the name in front of the '-' and group 2 is the extension, so
	# group 1 + group 2 is the unversioned name (LineBreak.txt).
	# Each version field may have several digits, so -10.0.0 matches as well.
	_file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
	                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
	                              r"(\.[a-z]+)$")

	def main():
	  """Copies the known UCD files from a source tree to a destination tree.

	  Reads the source root from sys.argv[1] and the destination root from
	  sys.argv[2]. Every file below the source root whose name, after removal
	  of a version suffix, is a key of _files is passed to the registered
	  action. Each unversioned name of a versioned file is printed.

	  Exits with status 1 if an argument is missing or if two source files
	  map to the same unversioned name.
	  """
	  if len(sys.argv) < 3:
	    sys.stderr.write(
	        "usage: %s source_folder destination_folder\n" % sys.argv[0])
	    sys.exit(1)
	  source_root = sys.argv[1]
	  dest_root = sys.argv[2]
	  # Collect all paths first so that the walk is finished before any file
	  # is written.
	  source_files = []
	  for root, _, names in os.walk(source_root):
	    for name in names:
	      source_files.append(os.path.join(root, name))
	  files_processed = set()
	  for source_file in source_files:
	    basename = os.path.basename(source_file)
	    match = _file_version_re.match(basename)
	    if match:
	      # Drop the version suffix: Name-5.2.0d3.txt becomes Name.txt.
	      basename = match.group(1) + match.group(2)
	      print(basename)
	    if basename not in _files:
	      continue
	    if basename in files_processed:
	      print("duplicate file basename %s!" % basename)
	      sys.exit(1)
	    files_processed.add(basename)
	    action = _files[basename]
	    if isinstance(action, tuple):
	      # (function, folder): the file goes into a non-default folder.
	      action, dest_folder = action
	    else:
	      dest_folder = "unidata"
	    dest_path = os.path.join(dest_root, dest_folder)
	    if not os.path.exists(dest_path):
	      os.makedirs(dest_path)
	    action(source_file, os.path.join(dest_path, basename))


	# Run main() only when this file is executed as a script.
	if __name__ == "__main__":
	  main()