ICU-21248 Adds source file check (UTF-8 and absence of BOM) to
Travis Continuous Integration.
ICU-21248 Factors in review comments.
ICU-21248 Changes: no more filtering of markdown files, moved the
script to the icu/tools/scripts/ directory, removed the BOM from one
README.md file.
ICU-21248 Adjusts path to the icu-file-utf8-check script.
ICU-21248 Extends coverage of UTF-8/BOM check to all of icu/ directory.
diff --git a/.travis.yml b/.travis.yml
index 9fe2453..ca33b5b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -195,3 +195,9 @@
- cd icu4c/source
script:
- test/hdrtst/testinternalheaders.sh
+
+ # Check source files for valid UTF-8 and for absence of BOM.
+ - name: "UTF-8 and BOM check"
+ os: linux
+ script:
+ - tools/scripts/icu-file-utf8-check.py
diff --git a/icu4c/source/test/testdata/break_rules/README.md b/icu4c/source/test/testdata/break_rules/README.md
index d2501c3..1deb4df 100644
--- a/icu4c/source/test/testdata/break_rules/README.md
+++ b/icu4c/source/test/testdata/break_rules/README.md
@@ -1,4 +1,4 @@
-<!--
+<!--
Copyright (C) 2016 and later: Unicode, Inc. and others.
License & terms of use: http://www.unicode.org/copyright.html
diff --git a/icu4c/source/tools/icu-file-utf8-check.py b/tools/scripts/icu-file-utf8-check.py
similarity index 69%
rename from icu4c/source/tools/icu-file-utf8-check.py
rename to tools/scripts/icu-file-utf8-check.py
index 86de259..9e30e3b 100755
--- a/icu4c/source/tools/icu-file-utf8-check.py
+++ b/tools/scripts/icu-file-utf8-check.py
@@ -22,6 +22,8 @@
# The tool operates recursively on the directory from which it is run.
# Only files from the ICU github repository are checked.
# No changes are made to the repository; only the working copy will be altered.
+# The script returns a non-zero exit code if any checked source file contains
+# non-UTF-8 bytes or if any checked file contains a disallowed UTF-8 BOM.
from __future__ import print_function
@@ -32,6 +34,10 @@
import getopt
+# List of directories to check for UTF-8 and BOM. Currently covers
+# all of icu/. Modify as needed.
+icu_directories_to_be_scanned = ["."]
+
def runCommand(cmd):
output_file = os.popen(cmd);
output_text = output_file.read();
@@ -45,13 +51,14 @@
def usage():
print("usage: " + sys.argv[0] + " [-h | --help]")
-
+
#
# File check. Check source code files for UTF-8 and all except text files for not containing a BOM
# file_name: name of a text file.
# is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
#
def check_file(file_name, is_source):
+ rc = 0
f = open(file_name, 'rb')
bytes = f.read()
f.close()
@@ -61,16 +68,19 @@
bytes.decode("UTF-8")
except UnicodeDecodeError:
print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
-
+ rc = 1
+
if bytes[0] == 0xef:
if not (file_name.endswith(".txt") or file_name.endswith(".sln")
- or file_name.endswith(".targets")
- or ".vcxproj" in file_name):
+ or file_name.endswith(".targets") or ".vcxproj" in file_name):
print("Warning: file %s contains a UTF-8 BOM: " % file_name)
+ rc = 1
- return
+ return rc
def main(argv):
+ exit_status = 0
+
try:
opts, args = getopt.getopt(argv, "h", ("help"))
except getopt.GetoptError:
@@ -84,23 +94,30 @@
if args:
print("unexpected command line argument")
usage()
- sys.exit()
-
- output = runCommand("git ls-files ");
- file_list = output.splitlines()
+ sys.exit(2)
source_file_re = re.compile(".*((?:\\.c$)|(?:\\.cpp$)|(?:\\.h$)|(?:\\.java$))")
-
- for f in file_list:
- if os.path.isdir(f):
- print("Skipping dir " + f)
- continue
- if not os.path.isfile(f):
- print("Repository file not in working copy: " + f)
- continue;
+ git_cmd = "git ls-files DIR"
- source_file = source_file_re.match(f)
- check_file(f, source_file)
+ for dir in icu_directories_to_be_scanned:
+ print('Scanning ' + dir)
+ output = runCommand(git_cmd.replace("DIR", dir))
+ file_list = output.splitlines()
+
+ for f in file_list:
+ if os.path.isdir(f):
+ print("Skipping dir " + f)
+ continue
+ if not os.path.isfile(f):
+ print("Repository file not in working copy: " + f)
+ continue;
+
+ source_file = source_file_re.match(f)
+ if check_file(f, source_file) != 0:
+ exit_status = 1
+
+ print(exit_status)
+ sys.exit(exit_status)
if __name__ == "__main__":
main(sys.argv[1:])