ICU-21237 Improving how file deletion and cleanup works by adding a build label. See #1246

commit: dcdb30d708f99efa3b7af65fd8984c7e204a93f5 [log] [tgz]
author: David Beaumont <dbeaumont@google.com> Wed Aug 19 21:16:28 2020 +0000
committer: David Beaumont <david.beaumont+github@gmail.com> Wed Aug 19 23:39:58 2020 +0200
tree: d99002bf40f27f07e792a4fa6ec19d09bd87c8f2
parent: 9a6a64ce05b7257a85f8d04125428e4a081105b7 [diff]
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java
index 382f76d..b2ef028 100644
--- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java

@@ -50,6 +50,21 @@
 
     private static final CharMatcher NOT_WHITESPACE = CharMatcher.whitespace().negate();
 
+    private static final String HEADER_FILE = "ldml2icu_header.txt";
+
+    // If present in the header of a file, this line is used to determine that the file was
+    // auto-generated. This allows us to change the rest of the header freely without issue.
+    // However if it's not present in the file, we fallback to comparing the rest of the
+    // header without it (since that's the old behaviour).
+    // Once there's been an ICU release with this line included in the headers of all data
+    // files, we can remove the fallback and just test for this line and nothing else.
+    private static final String WAS_GENERATED_LABEL =
+        "Generated using tools/cldr/cldr-to-icu/build-icu-data.xml";
+
+    // The number of header lines to check before giving up if we don't find the generated
+    // label.
+    private static final int MAX_HEADER_CHECK_LINES = 20;
+
     private Path root = null;
     private boolean forceDelete = false;
     private final List<Dir> outputDirs = new ArrayList<>();
@@ -57,7 +72,13 @@
 
     public CleanOutputDirectoryTask() {
         // TODO: Consider passing in header lines via Ant?
-        this.headerLines = readLinesFromResource("/ldml2icu_header.txt");
+        this.headerLines = readLinesFromResource("/" + HEADER_FILE);
+        // For now assume that the generated label is the last line of the header.
+        checkState(Iterables.getLast(headerLines).equals(WAS_GENERATED_LABEL),
+            "Expected last line of %s header file to be:\n\t%s", HEADER_FILE, WAS_GENERATED_LABEL);
+        // Make sure we check at least a few more lines than is in the current header.
+        checkState(MAX_HEADER_CHECK_LINES >= headerLines.size() + 5,
+            "Unexpectedly large header file; please increase MAX_HEADER_CHECK_LINES constant");
     }
 
     public static final class Retain extends Task {
@@ -214,31 +235,69 @@
             return false;
         }
         try (BufferedReader r = Files.newBufferedReader(path, UTF_8)) {
-            // A byte-order-mark (BOM) is added to ICU data files, but not JSON deps files, so just
-            // treat it as optional everywhere (it's not the important thing we check here).
-            r.mark(1);
-            int maybeByteOrderMark = r.read();
-            if (maybeByteOrderMark != '\uFEFF') {
-                // Also reset if the file was empty, but that should be harmless.
-                r.reset();
-            }
-            for (String headerLine : headerLines) {
-                String line = r.readLine();
-                if (line == null) {
-                    return false;
-                }
-                int headerStart = skipComment(line);
-                if (headerStart < 0
-                    || !line.regionMatches(headerStart, headerLine, 0, headerLine.length())) {
-                    return false;
-                }
-            }
-            return true;
+            return wasFileAutoGenerated(r, headerLines);
         } catch (IOException e) {
             throw new UncheckedIOException(e);
         }
     }
 
+    // TODO: Once the WAS_GENERATED_LABEL is in all auto-generated ICU data files, simplify this.
+    // Static and non-private for testing.
+    static boolean wasFileAutoGenerated(BufferedReader fileReader, ImmutableList<String> headerLines)
+            throws IOException {
+        // A byte-order-mark (BOM) is added to ICU data files, but not JSON deps files, so just
+        // treat it as optional everywhere (it's not the important thing we check here).
+        fileReader.mark(1);
+        int maybeByteOrderMark = fileReader.read();
+        if (maybeByteOrderMark != '\uFEFF') {
+            // Also reset if the file was empty, but that should be harmless.
+            fileReader.reset();
+        }
+        boolean isLenientHeaderMatchSoFar = true;
+        for (int n = 0; n < MAX_HEADER_CHECK_LINES ; n++) {
+            String line = fileReader.readLine();
+            // True if we have processed the header, not including the trailing generated label.
+            boolean headerIsProcessed = n >= headerLines.size() - 1;
+            boolean isCompleteLenientMatch = isLenientHeaderMatchSoFar && headerIsProcessed;
+            if (line == null) {
+                // We ran off the end of the file, so we're done.
+                return isCompleteLenientMatch;
+            }
+            int headerStart = skipComment(line);
+            if (headerStart < 0) {
+                // We ran off the end of the expected comment section, so we're done.
+                return isCompleteLenientMatch;
+            }
+            if (matchesToEndOfLine(line, headerStart, WAS_GENERATED_LABEL)) {
+                // Finding the generated label trumps any lenient matching.
+                return true;
+            }
+            if (!isLenientHeaderMatchSoFar) {
+                // We already failed at lenient matching, so keep going in the hope of finding
+                // the generated label (we don't need to check the header any more).
+                continue;
+            }
+            if (headerIsProcessed) {
+                // We finishing processing the header and it matched (not including the
+                // generated label) but for older data files, that's fine.
+                return true;
+            }
+            // Check the next header line (not including the trailing generated label).
+            isLenientHeaderMatchSoFar = matchesToEndOfLine(line, headerStart, headerLines.get(n));
+        }
+        // This is actually an unusual case. It corresponds to a file which:
+        // * has a leading comment section at least as long as MAX_HEADER_CHECK_LINES
+        // * does not contain WAS_GENERATED_LABEL anywhere in the first MAX_HEADER_CHECK_LINES
+        // Most files checked are expected to match, and so will not get here, but instead
+        // return via one of the return statements above.
+        return false;
+    }
+
+    private static boolean matchesToEndOfLine(String line, int start, String expected) {
+        return line.length() - start == expected.length()
+                && line.regionMatches(start, expected, 0, expected.length());
+    }
+
     private static int skipComment(String line) {
         if (line.startsWith("#")) {
             return toCommentStart(line, 1);

diff --git a/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt
index 70f6b83..48c27a0 100644
--- a/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt
+++ b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt

@@ -1,2 +1,3 @@
 © 2016 and later: Unicode, Inc. and others.
 License & terms of use: http://www.unicode.org/copyright.html#License
+Generated using tools/cldr/cldr-to-icu/build-icu-data.xml

diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTaskTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTaskTest.java
new file mode 100644
index 0000000..bc8d854
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTaskTest.java

@@ -0,0 +1,84 @@
+package org.unicode.icu.tool.cldrtoicu.ant;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.truth.BooleanSubject;
+import com.google.common.truth.Truth;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+
+@RunWith(JUnit4.class)
+public class CleanOutputDirectoryTaskTest {
+    // Not using the original field since we want this test to fail if this changes unexpectedly.
+    private static final String WAS_GENERATED_LABEL =
+        "Generated using tools/cldr/cldr-to-icu/build-icu-data.xml";
+
+    // Commented version of the label for test data.
+    private static final String WAS_GENERATED_LINE = "// " + WAS_GENERATED_LABEL;
+
+    private final static ImmutableList<String> HEADER = ImmutableList.of(
+        "Header#1",
+        "Header#2",
+        WAS_GENERATED_LABEL);
+
+    @Test
+    public void testWasFileAutoGenerated_lenientMatching() throws IOException {
+        // Testing comment prefixes (// or # supported equally)
+        assertWasAutogenerated("// Header#1", "// Header#2").isTrue();
+        assertWasAutogenerated("# Header#1", "# Header#2").isTrue();
+        assertWasAutogenerated("# Header#1", "// Header#2").isTrue();
+        assertWasAutogenerated("//    Header#1", "//Header#2").isTrue();
+        // Extra lines
+        assertWasAutogenerated("// Header#1", "// Header#2", "// More comment", "Not a comment").isTrue();
+        // BOM is ignored on first line
+        assertWasAutogenerated("\uFEFF// Header#1", "// Header#2").isTrue();
+    }
+
+    @Test
+    public void testWasFileAutoGenerated_lenientMatching_fail() throws IOException {
+        // New blank line.
+        assertWasAutogenerated("", "// Header#1", "// Header#2").isFalse();
+        // Duplicated line.
+        assertWasAutogenerated("// Header#1", "// Header#1").isFalse();
+        // Reversed lines.
+        assertWasAutogenerated("// Header#2", "// Header#1").isFalse();
+        // Not commented.
+        assertWasAutogenerated("Header#1", "Header#2").isFalse();
+        // Extra blank line.
+        assertWasAutogenerated("// Header#1", "", "// Header#2").isFalse();
+        // Misplaced BOM.
+        assertWasAutogenerated("// Header#1", "\uFEFF// Header#2").isFalse();
+    }
+
+    @Test
+    public void testWasFileAutoGenerated_withLabel() throws IOException {
+        // With the label in the header comment section everything passes.
+        assertWasAutogenerated("// Header#1", "// Header#2", WAS_GENERATED_LINE).isTrue();
+        assertWasAutogenerated("// Hello", "// World", WAS_GENERATED_LINE).isTrue();
+        assertWasAutogenerated("// Shorter Header", WAS_GENERATED_LINE).isTrue();
+        assertWasAutogenerated("// This", "// Is", "// A", "// Longer", "// Header", WAS_GENERATED_LINE).isTrue();
+        assertWasAutogenerated("// Where the label is", WAS_GENERATED_LINE, "// Does not matter").isTrue();
+        assertWasAutogenerated(WAS_GENERATED_LINE).isTrue();
+    }
+
+    @Test
+    public void testWasFileAutoGenerated_matchingEndsAfterComments() throws IOException {
+        assertWasAutogenerated("// Header#1", "// Header#Changed", WAS_GENERATED_LINE).isTrue();
+        // Label outside the header comment section does not count.
+        assertWasAutogenerated("// Header#1", "// Header#Changed", "Not a comment", WAS_GENERATED_LINE).isFalse();
+    }
+
+    private static BooleanSubject assertWasAutogenerated(String... fileLines) throws IOException {
+        return Truth.assertWithMessage("wasAutogenerated: %s", Arrays.asList(fileLines))
+            .that(CleanOutputDirectoryTask.wasFileAutoGenerated(testFile(fileLines), HEADER));
+    }
+
+    private static BufferedReader testFile(String... lines) {
+        return new BufferedReader(new StringReader(String.join("\n", lines) + "\n"));
+    }
+}
commit	dcdb30d708f99efa3b7af65fd8984c7e204a93f5	[log] [tgz]
author	David Beaumont <dbeaumont@google.com>	Wed Aug 19 21:16:28 2020 +0000
committer	David Beaumont <david.beaumont+github@gmail.com>	Wed Aug 19 23:39:58 2020 +0200
tree	d99002bf40f27f07e792a4fa6ec19d09bd87c8f2
parent	9a6a64ce05b7257a85f8d04125428e4a081105b7 [diff]