tools/scripts/icu-file-utf8-check.py - external/github.com/unicode-org/icu - Git at Google

 #! /usr/bin/python -B

 # Copyright (C) 2016 and later: Unicode, Inc. and others.
 # License & terms of use: http://www.unicode.org/copyright.html

 # Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
 # All rights reserved.

 #
 #  Script to check that ICU source files contain only valid UTF-8 encoded text,
 #  and that all files except '.txt', '.sln', '.targets' and '.vcxproj*' files do not
 #  contain a Byte Order Mark (BOM).
 #
 #  THIS SCRIPT DOES NOT WORK ON WINDOWS
 #     It only works correctly on platforms where the native line ending is a plain \n
 #
 #  usage:
 #     icu-file-utf8-check.py  [options]
 #
 #  options:
 #     -h | --help    Print a usage line and exit.
 #
 #  The tool operates recursively on the directory from which it is run.
 #  Only files from the ICU github repository are checked.
 #  No changes are made to the repository or to the working copy; files are only read.
 #  The script returns a non-zero exit code if any checked source file contains
 #  bytes that are not valid UTF-8, or if any checked file has a disallowed BOM or
 #  a \r line ending.

 from __future__ import print_function

 import sys
 import os
 import os.path
 import re
 import getopt


 # Directories whose git-tracked files are checked for UTF-8 and BOM.
 # The single entry "." currently covers all of icu/; extend as needed.
 icu_directories_to_be_scanned = [
     ".",
 ]

 # Paths that are exempt from the \r line-ending check. Should this list
 # grow too long, consider keeping it in a separate file instead.
 ignore_cr_in_files = [
     "vendor/double-conversion/upstream/msvc/testrunner.cmd",
 ]

 def runCommand(cmd):
     """Run *cmd* through the shell and capture its standard output.

     Returns a tuple (output_text, exit_status). exit_status is the value
     returned by closing the pipe: None when the command succeeded, a
     non-None status otherwise.
     """
     pipe = os.popen(cmd)
     captured = pipe.read()
     # close() must be called explicitly: its return value carries the status.
     status = pipe.close()
     return captured, status


 def usage():
     """Print a one-line usage summary naming the script as it was invoked."""
     script_name = sys.argv[0]
     summary = "usage: " + script_name + " [-h | --help]"
     print(summary)


 #
 #  File check.         Check source code files for UTF-8 and all except text files for not containing a BOM
 #    file_name:        name of a text file.
 #    is_source:        Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
 #
 def check_file(file_name, is_source):
     """Check one file for invalid UTF-8 and for an unwanted UTF-8 BOM.

     Args:
         file_name: Path of the file to check; it is read in binary mode.
         is_source: Truthy if the file is a source code file (.c, .cpp, .h,
             .java). Only then must the content decode as UTF-8.

     Returns:
         0 if the file passes every applicable check, 1 otherwise. A message
         is printed for each failed check.
     """
     rc = 0
     # 'with' closes the file even if the read fails.
     with open(file_name, 'rb') as f:
         data = f.read()

     if is_source:
         try:
             data.decode("UTF-8")
         except UnicodeDecodeError:
             print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
             rc = 1

     # Compare against the complete three-byte BOM. startswith() is safe on
     # empty files (data[0] would raise IndexError) and behaves the same on
     # Python 2 and 3, where data[0] is a str and an int respectively.
     if data.startswith(b"\xef\xbb\xbf"):
         bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                        or ".vcxproj" in file_name)
         if not bom_allowed:
             print("Warning: file %s contains a UTF-8 BOM: " % file_name)
             rc = 1

     return rc

 def _file_has_cr(path):
     """Return True if grep finds a carriage return in the text file *path*.

     grep exits with status 0 only when it reports a match; the -I option
     makes it treat binary files as containing no match.
     """
     # Imported here because the module header does not import subprocess.
     import subprocess
     # An argument list (no shell) passes file names containing spaces or
     # shell metacharacters through unchanged; "--" keeps a name that starts
     # with "-" from being parsed as an option.
     grep = subprocess.Popen(["grep", "-rPIl", "\\r", "--", path],
                             stdout=subprocess.PIPE)
     grep.communicate()  # drain and discard the file name printed by -l
     return grep.returncode == 0


 def main(argv):
     """Check every git-tracked file and exit with the overall status.

     Args:
         argv: Command line arguments, without the program name.

     Always terminates through sys.exit(): status 0 if every file passes,
     1 if any check fails or the git file listing fails, 2 on a usage error.
     """
     exit_status = 0

     try:
         opts, args = getopt.getopt(argv, "h", ["help"])
     except getopt.GetoptError as err:
         # Report what getopt actually rejected; argv[0] may be a valid
         # option that merely precedes the offending one.
         print(err.msg)
         usage()
         sys.exit(2)
     for opt, arg in opts:
         if opt in ("-h", "--help"):
             usage()
             sys.exit()
     if args:
         print("unexpected command line argument")
         usage()
         sys.exit(2)

     source_file_re = re.compile(".*((?:\\.c$)|(?:\\.cpp$)|(?:\\.h$)|(?:\\.java$))")
     git_cmd = "git ls-files DIR"

     for directory in icu_directories_to_be_scanned:
         print('Scanning ' + directory)
         cmd = git_cmd.replace("DIR", directory)
         output, rc = runCommand(cmd)
         if rc:
             print('"', cmd, '" failed. Exiting.', file=sys.stderr)
             # Without a file list nothing can be verified, so stop with a
             # failure status instead of reporting success.
             sys.exit(1)

         for f in output.splitlines():
             if os.path.isdir(f):
                 print("Skipping dir " + f)
                 continue
             if not os.path.isfile(f):
                 print("Repository file not in working copy: " + f)
                 continue

             source_file = source_file_re.match(f)
             if check_file(f, source_file) != 0:
                 exit_status = 1

             # Lastly, check the line endings of the file.
             if _file_has_cr(f) and f not in ignore_cr_in_files:
                 print("File ", f, " has \\r line ending")
                 exit_status = 1

     print(exit_status)
     sys.exit(exit_status)

 if __name__ == "__main__":
     # Script entry point: hand everything after the program name to main(),
     # which terminates the process itself through sys.exit().
     cli_args = sys.argv[1:]
     main(cli_args)
	#! /usr/bin/python -B

	# Copyright (C) 2016 and later: Unicode, Inc. and others.
	# License & terms of use: http://www.unicode.org/copyright.html

	# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
	# All rights reserved.

	#
	# Script to check that ICU source files contain only valid UTF-8 encoded text,
	# and that all files except '.txt', '.sln', '.targets' and '.vcxproj*' files do not
	# contain a Byte Order Mark (BOM).
	#
	# THIS SCRIPT DOES NOT WORK ON WINDOWS
	# It only works correctly on platforms where the native line ending is a plain \n
	#
	# usage:
	# icu-file-utf8-check.py [options]
	#
	# options:
	# -h | --help Print a usage line and exit.
	#
	# The tool operates recursively on the directory from which it is run.
	# Only files from the ICU github repository are checked.
	# No changes are made to the repository or to the working copy; files are only read.
	# The script returns a non-zero exit code if any checked source file contains
	# bytes that are not valid UTF-8, or if any checked file has a disallowed BOM or
	# a \r line ending.

	from __future__ import print_function

	import sys
	import os
	import os.path
	import re
	import getopt


	# Directories whose git-tracked files are checked for UTF-8 and BOM.
	# The single entry "." currently covers all of icu/; extend as needed.
	icu_directories_to_be_scanned = [
	    ".",
	]

	# Paths that are exempt from the \r line-ending check. Should this list
	# grow too long, consider keeping it in a separate file instead.
	ignore_cr_in_files = [
	    "vendor/double-conversion/upstream/msvc/testrunner.cmd",
	]

	def runCommand(cmd):
	    """Run *cmd* through the shell and capture its standard output.

	    Returns a tuple (output_text, exit_status). exit_status is the value
	    returned by closing the pipe: None when the command succeeded, a
	    non-None status otherwise.
	    """
	    output_file = os.popen(cmd)
	    output_text = output_file.read()
	    # close() must be called explicitly: its return value carries the status.
	    exit_status = output_file.close()

	    return output_text, exit_status


	def usage():
	    """Print a one-line usage summary naming the script as it was invoked."""
	    # The option separator is a plain "|"; no backslash belongs in the text.
	    print("usage: " + sys.argv[0] + " [-h | --help]")


	#
	# File check. Check source code files for UTF-8 and all except text files for not containing a BOM
	# file_name: name of a text file.
	# is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
	#
	def check_file(file_name, is_source):
	    """Check one file for invalid UTF-8 and for an unwanted UTF-8 BOM.

	    Args:
	        file_name: Path of the file to check; it is read in binary mode.
	        is_source: Truthy if the file is a source code file (.c, .cpp, .h,
	            .java). Only then must the content decode as UTF-8.

	    Returns:
	        0 if the file passes every applicable check, 1 otherwise. A message
	        is printed for each failed check.
	    """
	    rc = 0
	    # 'with' closes the file even if the read fails.
	    with open(file_name, 'rb') as f:
	        data = f.read()

	    if is_source:
	        try:
	            data.decode("UTF-8")
	        except UnicodeDecodeError:
	            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
	            rc = 1

	    # Compare against the complete three-byte BOM. startswith() is safe on
	    # empty files (data[0] would raise IndexError) and behaves the same on
	    # Python 2 and 3, where data[0] is a str and an int respectively.
	    if data.startswith(b"\xef\xbb\xbf"):
	        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
	                       or ".vcxproj" in file_name)
	        if not bom_allowed:
	            print("Warning: file %s contains a UTF-8 BOM: " % file_name)
	            rc = 1

	    return rc

	def _file_has_cr(path):
	    """Return True if grep finds a carriage return in the text file *path*.

	    grep exits with status 0 only when it reports a match; the -I option
	    makes it treat binary files as containing no match.
	    """
	    # Imported here because the module header does not import subprocess.
	    import subprocess
	    # An argument list (no shell) passes file names containing spaces or
	    # shell metacharacters through unchanged; "--" keeps a name that starts
	    # with "-" from being parsed as an option.
	    grep = subprocess.Popen(["grep", "-rPIl", "\\r", "--", path],
	                            stdout=subprocess.PIPE)
	    grep.communicate()  # drain and discard the file name printed by -l
	    return grep.returncode == 0


	def main(argv):
	    """Check every git-tracked file and exit with the overall status.

	    Args:
	        argv: Command line arguments, without the program name.

	    Always terminates through sys.exit(): status 0 if every file passes,
	    1 if any check fails or the git file listing fails, 2 on a usage error.
	    """
	    exit_status = 0

	    try:
	        opts, args = getopt.getopt(argv, "h", ["help"])
	    except getopt.GetoptError as err:
	        # Report what getopt actually rejected; argv[0] may be a valid
	        # option that merely precedes the offending one.
	        print(err.msg)
	        usage()
	        sys.exit(2)
	    for opt, arg in opts:
	        if opt in ("-h", "--help"):
	            usage()
	            sys.exit()
	    if args:
	        print("unexpected command line argument")
	        usage()
	        sys.exit(2)

	    # Plain "|" alternation: an escaped "\|" would match a literal pipe
	    # character and no source file name would ever be recognised.
	    source_file_re = re.compile(".*((?:\\.c$)|(?:\\.cpp$)|(?:\\.h$)|(?:\\.java$))")
	    git_cmd = "git ls-files DIR"

	    for directory in icu_directories_to_be_scanned:
	        print('Scanning ' + directory)
	        cmd = git_cmd.replace("DIR", directory)
	        output, rc = runCommand(cmd)
	        if rc:
	            print('"', cmd, '" failed. Exiting.', file=sys.stderr)
	            # Without a file list nothing can be verified, so stop with a
	            # failure status instead of reporting success.
	            sys.exit(1)

	        for f in output.splitlines():
	            if os.path.isdir(f):
	                print("Skipping dir " + f)
	                continue
	            if not os.path.isfile(f):
	                print("Repository file not in working copy: " + f)
	                continue

	            source_file = source_file_re.match(f)
	            if check_file(f, source_file) != 0:
	                exit_status = 1

	            # Lastly, check the line endings of the file.
	            if _file_has_cr(f) and f not in ignore_cr_in_files:
	                print("File ", f, " has \\r line ending")
	                exit_status = 1

	    print(exit_status)
	    sys.exit(exit_status)

	if __name__ == "__main__":
	    # Script entry point: hand everything after the program name to main(),
	    # which terminates the process itself through sys.exit().
	    main(sys.argv[1:])