tools/unicode/py/parsescriptmetadata.py - external/github.com/unicode-org/icu - Git at Google

 #!/usr/bin/python -B
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2017 and later: Unicode, Inc. and others.
 # License & terms of use: http://www.unicode.org/copyright.html
 #
 # Copyright (c) 2013-2016 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 # parsescriptmetadata.py
 #
 # 2013feb15 Markus W. Scherer
 #
 # ./parsescriptmetadata.py
 #   ~/svn.icu/trunk/src/source/common/unicode/uscript.h
 #   ~/svn.cldr/trunk/common/properties/scriptMetadata.txt

 """Parses ICU4C uscript.h & CLDR scriptMetadata.txt,
 and writes ICU script data initializers."""

 import re
 import sys

 def main():
   """Parses uscript.h & scriptMetadata.txt and prints ICU initializers.

   Expects two command-line arguments in sys.argv: the path to the ICU4C
   uscript.h header and the path to the CLDR scriptMetadata.txt file.
   If fewer are given, prints a usage message and returns.

   Prints one line per ICU script number, from 0 through the highest number
   found in uscript.h: the combined sample/usage/flag expression for scripts
   that have a metadata line, or "0," for numbers without one.

   Raises:
     KeyError: if scriptMetadata.txt contains an ISO script code for which
       no USCRIPT_ constant was found in uscript.h.
   """
   # Note: print is always called with one parenthesized string so that the
   # script behaves the same under Python 2 and Python 3.
   if len(sys.argv) < 3:
     print("Usage: {}  path/to/ICU4C/uscript.h  "
           "path/to/CLDR/scriptMetadata.txt".format(sys.argv[0]))
     return
   (uscript_path, smd_path) = sys.argv[1:3]

   iso_to_icu = {}
   max_icu_num = 0

   # Parse lines like
   #   USCRIPT_ARABIC       =  2,  /* Arab */
   # and extract the ICU numeric script code and the ISO script code.
   script_num_re = re.compile(r" *= *([0-9]+), */\* *([A-Z][a-z]{3}) *\*/")
   with open(uscript_path, "r") as uscript_file:
     for line in uscript_file:
       line = line.strip()
       if not line: continue
       if line.startswith("#"): continue  # whole-line comment
       match = script_num_re.search(line)
       if match:
         icu_num = int(match.group(1))
         iso_to_icu[match.group(2)] = icu_num
         if icu_num > max_icu_num: max_icu_num = icu_num

   # One slot per ICU script number; slots without metadata stay None.
   icu_data = [None] * (max_icu_num + 1)

   # Parse lines like
   #   Arab; 8; 0628; SA; 1; RECOMMENDED; YES; NO; YES; NO; NO
   # and put the data (as strings) into the icu_data list.
   with open(smd_path, "r") as smd_file:
     for line in smd_file:
       # Drop a trailing (or whole-line) "#" comment before parsing.
       comment_start = line.find("#")
       if comment_start >= 0: line = line[0:comment_start]
       line = line.strip()
       if not line: continue

       fields = line.split(";")
       # Lines with fewer than 11 fields are skipped; index 10 is needed.
       if len(fields) < 11: continue
       iso_code = fields[0].strip()
       # Raises KeyError if uscript.h did not define this ISO code.
       icu_num = iso_to_icu[iso_code]
       icu_data[icu_num] = (iso_code,
           # sample, usage
           fields[2].strip(), fields[5].strip(),
           # RTL, LB, cased
           fields[6].strip(), fields[7].strip(), fields[10].strip())

   # Print ICU array initializers with the relevant data.
   for t in icu_data:
     if t:
       (iso_code, sample, usage, rtl, lb, cased) = t
       s = "0x" + sample + " | " + usage
       if rtl == "YES": s += " | RTL"
       if lb == "YES": s += " | LB_LETTERS"
       if cased == "YES": s += " | CASED"
       print("    " + s + ",  // " + iso_code)
     else:
       print("    0,")


 # Run main() only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
   main()
	#!/usr/bin/python -B
	# -*- coding: utf-8 -*-
	#
	# Copyright (C) 2017 and later: Unicode, Inc. and others.
	# License & terms of use: http://www.unicode.org/copyright.html
	#
	# Copyright (c) 2013-2016 International Business Machines
	# Corporation and others. All Rights Reserved.
	#
	# parsescriptmetadata.py
	#
	# 2013feb15 Markus W. Scherer
	#
	# ./parsescriptmetadata.py
	# ~/svn.icu/trunk/src/source/common/unicode/uscript.h
	# ~/svn.cldr/trunk/common/properties/scriptMetadata.txt

	"""Parses ICU4C uscript.h & CLDR scriptMetadata.txt,
	and writes ICU script data initializers."""

	import re
	import sys

	def main():
	  """Parses uscript.h & scriptMetadata.txt and prints ICU initializers.

	  Expects two command-line arguments in sys.argv: the path to the ICU4C
	  uscript.h header and the path to the CLDR scriptMetadata.txt file.
	  If fewer are given, prints a usage message and returns.

	  Prints one line per ICU script number, from 0 through the highest number
	  found in uscript.h: the combined sample/usage/flag expression for scripts
	  that have a metadata line, or "0," for numbers without one.

	  Raises:
	    KeyError: if scriptMetadata.txt contains an ISO script code for which
	      no USCRIPT_ constant was found in uscript.h.
	  """
	  # Note: print is always called with one parenthesized string so that the
	  # script behaves the same under Python 2 and Python 3.
	  if len(sys.argv) < 3:
	    print("Usage: {}  path/to/ICU4C/uscript.h  "
	          "path/to/CLDR/scriptMetadata.txt".format(sys.argv[0]))
	    return
	  (uscript_path, smd_path) = sys.argv[1:3]

	  iso_to_icu = {}
	  max_icu_num = 0

	  # Parse lines like
	  #   USCRIPT_ARABIC       =  2,  /* Arab */
	  # and extract the ICU numeric script code and the ISO script code.
	  # The pattern must tolerate variable spacing and match the literal
	  # "/*" and "*/" comment delimiters around the four-letter ISO code.
	  script_num_re = re.compile(r" *= *([0-9]+), */\* *([A-Z][a-z]{3}) *\*/")
	  with open(uscript_path, "r") as uscript_file:
	    for line in uscript_file:
	      line = line.strip()
	      if not line: continue
	      if line.startswith("#"): continue  # whole-line comment
	      match = script_num_re.search(line)
	      if match:
	        icu_num = int(match.group(1))
	        iso_to_icu[match.group(2)] = icu_num
	        if icu_num > max_icu_num: max_icu_num = icu_num

	  # One slot per ICU script number; slots without metadata stay None.
	  icu_data = [None] * (max_icu_num + 1)

	  # Parse lines like
	  #   Arab; 8; 0628; SA; 1; RECOMMENDED; YES; NO; YES; NO; NO
	  # and put the data (as strings) into the icu_data list.
	  with open(smd_path, "r") as smd_file:
	    for line in smd_file:
	      # Drop a trailing (or whole-line) "#" comment before parsing.
	      comment_start = line.find("#")
	      if comment_start >= 0: line = line[0:comment_start]
	      line = line.strip()
	      if not line: continue

	      fields = line.split(";")
	      # Lines with fewer than 11 fields are skipped; index 10 is needed.
	      if len(fields) < 11: continue
	      iso_code = fields[0].strip()
	      # Raises KeyError if uscript.h did not define this ISO code.
	      icu_num = iso_to_icu[iso_code]
	      icu_data[icu_num] = (iso_code,
	          # sample, usage
	          fields[2].strip(), fields[5].strip(),
	          # RTL, LB, cased
	          fields[6].strip(), fields[7].strip(), fields[10].strip())

	  # Print ICU array initializers with the relevant data.
	  # The flags are joined with a plain " | " (C bitwise OR), no backslash.
	  for t in icu_data:
	    if t:
	      (iso_code, sample, usage, rtl, lb, cased) = t
	      s = "0x" + sample + " | " + usage
	      if rtl == "YES": s += " | RTL"
	      if lb == "YES": s += " | LB_LETTERS"
	      if cased == "YES": s += " | CASED"
	      print("    " + s + ",  // " + iso_code)
	    else:
	      print("    0,")


	# Run main() only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	  main()