| #! /usr/bin/python -B |
| |
| # Copyright (C) 2016 and later: Unicode, Inc. and others. |
| # License & terms of use: http://www.unicode.org/copyright.html |
| |
| # Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others. |
| # All rights reserved. |
| |
| # |
| # Script to check that ICU source files contain only valid UTF-8 encoded text, |
| # and that no files other than '.txt', '.sln', '.targets' and '.vcxproj' files contain a Byte Order Mark (BOM). |
| # |
| # THIS SCRIPT DOES NOT WORK ON WINDOWS |
| # It only works correctly on platforms where the native line ending is a plain \n |
| # |
| # usage: |
| # icu-file-utf8-check.py [options] |
| # |
| # options: |
| # -h | --help Print a usage line and exit. |
| # |
| # The tool operates recursively on the directory from which it is run. |
| # Only files from the ICU github repository are checked. |
| # No changes are made to the repository or to the working copy; files are only read. |
| # The script checks all tracked files and returns a non-zero exit code if any |
| # source file contains non-UTF-8 bytes, or if any checked file contains a |
| # disallowed BOM or a \r line ending. |
| |
| from __future__ import print_function |
| |
| import sys |
| import os |
| import os.path |
| import re |
| import getopt |
| |
| |
# Directories handed to "git ls-files" and scanned for UTF-8 validity and
# BOMs. "." currently covers all of icu/. Modify as needed.
icu_directories_to_be_scanned = [
    ".",
]

# Files that are permitted to contain \r line endings. If this list grows
# too long, consider keeping it in a separate file instead.
ignore_cr_in_files = ["vendor/double-conversion/upstream/msvc/testrunner.cmd"]
| |
def runCommand(cmd):
    """Run *cmd* through the shell and collect its standard output.

    Returns a (text, status) pair: everything the command wrote to stdout,
    and the value returned by closing the pipe (None when the command
    exited successfully).
    """
    pipe = os.popen(cmd)
    captured = pipe.read()
    status = pipe.close()
    return captured, status
| |
| |
def usage():
    """Print a one-line usage summary for this script to standard output."""
    program = sys.argv[0]
    print("usage: %s [-h | --help]" % program)
| |
| |
| # |
| # File check. Check source code files for UTF-8 and all except text files for not containing a BOM |
| # file_name: name of a text file. |
| # is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java). |
| # |
def check_file(file_name, is_source):
    """Check one file for invalid UTF-8 and for a leading UTF-8 BOM.

    Args:
        file_name: Path of the file to check.
        is_source: Truthy if the file is a source code file (.c, .cpp, .h,
            .java). Only source files are required to decode as UTF-8.

    Returns:
        0 if the file passes every applicable check, 1 otherwise. A
        diagnostic line is printed for each failed check.
    """
    rc = 0
    with open(file_name, 'rb') as f:
        data = f.read()

    if is_source:
        try:
            data.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
            rc = 1

    # Compare against the complete three-byte BOM with startswith():
    #  - an empty file no longer raises IndexError,
    #  - it behaves the same on Python 2 (where data[0] is a str, not an int),
    #  - files that merely begin with another 0xEF-led character are not
    #    misreported as carrying a BOM.
    if data.startswith(b"\xef\xbb\xbf"):
        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)
            rc = 1

    return rc
| |
def _file_has_carriage_return(path):
    """Return True if grep reports a carriage return in the file *path*.

    grep is run with -P (so the pattern \\r means a CR byte), -I (binary
    files are treated as non-matching) and -l. Raises OSError if grep
    cannot be started.
    """
    import subprocess

    # Pass the arguments as a list so that no shell ever parses the file
    # name: names containing spaces or shell metacharacters are handled
    # verbatim, and "--" keeps a leading '-' from being read as an option.
    grep = subprocess.Popen(["grep", "-rPIl", r"\r", "--", path],
                            stdout=subprocess.PIPE)
    grep.communicate()
    # grep exits with status 0 only when it found a match.
    return grep.returncode == 0


def main(argv):
    """Check every git-tracked file under the configured directories.

    Each file is passed to check_file() (UTF-8 validity for source files,
    BOM check for all files) and then searched for \\r line endings.

    Args:
        argv: Command-line arguments without the program name. Only
            -h / --help is accepted.

    Exits (via sys.exit) with status 0 if every file passed, 1 if any check
    failed or the git file listing could not be obtained, and 2 on a
    command-line usage error.
    """
    exit_status = 0

    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError as err:
        # Report getopt's own message: argv[0] is not necessarily the
        # argument that was rejected.
        print("Error: " + str(err))
        usage()
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        sys.exit(2)

    source_suffixes = (".c", ".cpp", ".h", ".java")
    git_cmd = "git ls-files DIR"

    for directory in icu_directories_to_be_scanned:
        print('Scanning ' + directory)
        cmd = git_cmd.replace("DIR", directory)
        output, rc = runCommand(cmd)
        if rc:
            # Without a file list nothing can be checked; stop with a
            # failure status rather than reporting success on zero files.
            print('"', cmd, '" failed. Exiting.', file=sys.stderr)
            sys.exit(1)

        for f in output.splitlines():
            if os.path.isdir(f):
                print("Skipping dir " + f)
                continue
            if not os.path.isfile(f):
                print("Repository file not in working copy: " + f)
                continue

            if check_file(f, f.endswith(source_suffixes)) != 0:
                exit_status = 1

            # Lastly, check the line endings of the file.
            if _file_has_carriage_return(f) and f not in ignore_cr_in_files:
                print("File ", f, " has \\r line ending")
                exit_status = 1

    print(exit_status)
    sys.exit(exit_status)
| |
# Run the checker only when executed as a script, forwarding the
# command-line arguments without the program name.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)