// blob: 5df53606ba39146d3e1aa23a7d225b2c99260f3d [file] [log] [blame]
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkElementIndex;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import com.google.common.base.Joiner;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
/**
* Helper tool to dump the resource bundle paths and values from an IcuData instance in a stable
* ordering, to allow easy comparison in cases where ICU ordering changes. This could easily be
* extended to be a more fully featured "diff" tool or a proper ICU data file parser.
*
* <p>This is a temporary debugging tool and should not be relied upon during any part of the data
* generation process.
*/
final class IcuDataDumper {
private static final Joiner LIST_JOINER = Joiner.on(',');
private static final RbPath VERSION = RbPath.of("Version");
/**
 * Command line entry point. Dumps the parsed paths and values of a single ICU data file, or
 * the merged data of every matching file below a directory, to standard output.
 *
 * @param args {@code args[0]} is the file or directory to read; the optional {@code args[1]}
 *     is a regular expression that file names must fully match (only permitted when
 *     {@code args[0]} is a directory).
 * @throws IllegalArgumentException if the number of arguments is not 1 or 2, or if a name
 *     pattern is given for something that is not a directory.
 * @throws IOException if reading the file(s) fails.
 */
public static void main(String... args) throws IOException {
    Path fileOrDir;
    Optional<Pattern> name = Optional.empty();
    switch (args.length) {
    case 2:
        name = Optional.of(Pattern.compile(args[1]));
        // Deliberate fall through: the two-argument form also needs the path from args[0].
    case 1:
        fileOrDir = Paths.get(args[0]);
        break;
    default:
        throw new IllegalArgumentException("Usage: <file-or-dir> [<name-pattern>]");
    }
    if (Files.isDirectory(fileOrDir)) {
        walkDirectory(fileOrDir, name);
    } else {
        checkArgument(!name.isPresent(),
            "cannot specify a name pattern for a non-directory file: %s", fileOrDir);
        IcuDataParser parser = new IcuDataParser(fileOrDir);
        // The result of parse() is not checked here: if the file is rejected, no data was
        // collected and so nothing is printed.
        parser.parse();
        dump(parser.icuData);
    }
}
/**
 * Parses every regular file found by walking {@code fileOrDir} whose file name matches the
 * optional pattern, merges the parsed data and dumps the result.
 *
 * <p>The values first seen for a path are the ones kept. Any later file that defines the same
 * path must carry equal values, except for the {@code Version} path which is allowed to differ.
 *
 * @param fileOrDir the directory to walk.
 * @param name if present, a pattern that a file's name must fully match for it to be parsed.
 * @throws IllegalStateException if two files hold different values for the same path.
 * @throws IOException if walking the directory or reading a file fails.
 */
private static void walkDirectory(Path fileOrDir, Optional<Pattern> name) throws IOException {
    Predicate<Path> matchesName =
        f -> name.map(n -> n.matcher(f.getFileName().toString()).matches()).orElse(true);
    List<IcuDataParser> icuParsers;
    // Files.walk() holds open directory handles, so the stream must be closed after use.
    try (Stream<Path> files = Files.walk(fileOrDir)) {
        icuParsers = files
            .filter(Files::isRegularFile)
            .filter(matchesName)
            .map(IcuDataParser::new)
            .collect(toImmutableList());
    }
    ListMultimap<RbPath, RbValue> allPaths = ArrayListMultimap.create();
    for (IcuDataParser p : icuParsers) {
        // Files rejected by parse() contribute no data, so its result need not be checked.
        p.parse();
        for (RbPath k : p.icuData.keySet()) {
            List<RbValue> values = p.icuData.get(k);
            if (!allPaths.containsKey(k)) {
                allPaths.putAll(k, values);
            } else if (!VERSION.equals(k)) {
                // The template needs an explicit %s placeholder for the path to be formatted
                // into the message rather than appended as a leftover argument.
                checkState(allPaths.get(k).equals(values), "inconsistent data for path: %s", k);
            }
        }
    }
    dump(allPaths);
}
/**
 * Prints one line per path to standard output, in sorted path order, in the form
 * {@code <path> :: <comma separated values>}.
 */
private static void dump(ListMultimap<RbPath, RbValue> allPaths) {
    Stream<RbPath> orderedPaths = allPaths.keySet().stream().sorted();
    orderedPaths
        .map(path -> path + " :: " + LIST_JOINER.join(allPaths.get(path)))
        .forEach(System.out::println);
}
private static final class IcuDataParser {
// Path of file being parsed.
private final Path path;
// Comments in header (before data starts), without comment characters.
private final List<String> headerComment = new ArrayList<>();
// ICU data name (the name of the root element).
private String name = null;
// ICU data values.
private final ListMultimap<RbPath, RbValue> icuData = ArrayListMultimap.create();
// Current line number (1-indexed).
private int lineNumber = 0;
// The type of the previous line that was processed.
private LineType lastType = LineType.COMMENT;
// True when inside /* .. */ comments in the header.
private boolean inBlockComment = false;
// True when in the final top-level group at the end of parsing.
private boolean inFinalGroup = false;
// True when a partial (line wrapped) value has been read.
private boolean isLineContinuation = false;
// Current path while parsing (NOT including the root element).
private Deque<String> pathStack = new ArrayDeque<>();
// Current sequence of values for the path (as defined in the current path stack).
private List<String> currentValue = new ArrayList<>();
// Current partially read value of a multi-line value.
private String wrappedValue = "";
// Map of indices used to auto-generate names for anonymous path segments.
// TODO: Check if this is even needed and remove if not.
private Multiset<Integer> indices = HashMultiset.create();
/**
 * Creates a parser for the given file. The constructor only stores the (non-null) path; the
 * file itself is read when {@code parse()} is called.
 */
IcuDataParser(Path path) {
this.path = checkNotNull(path);
}
/**
 * Reads the file and, if it looks like an ICU data file, parses every line of it into
 * {@code icuData}.
 *
 * @return {@code true} if the file was parsed; {@code false} if it was skipped because it is
 *     empty or its first line does not start with a byte order mark followed by {@code //}.
 * @throws IOException if the file cannot be read.
 * @throws IllegalStateException if parsing finishes in an unexpected state.
 * @throws RuntimeException if an individual line cannot be processed (the message carries the
 *     file, line number and line content).
 */
public boolean parse() throws IOException {
    List<String> lines = Files.readAllLines(path);
    // Best approximation to a magic number we have (BOM plus inline comment). This stops
    // us trying to parse the transliteration files, which are a different type. An empty
    // file has no first line to inspect and is skipped in the same way.
    if (lines.isEmpty() || !lines.get(0).startsWith("\uFEFF//")) {
        return false;
    }
    lines.stream().map(whitespace()::trimFrom).forEach(this::processLineWithCheck);
    // Sanity check for expected final state. Just checking the "lastType" should be enough
    // to catch everything else (due to transition rules and how the code tidies up) but it
    // seems prudent to sanity check everything just in case.
    checkState(lastType == LineType.GROUP_END);
    checkState(!inBlockComment);
    checkState(name != null);
    checkState(pathStack.isEmpty() && inFinalGroup);
    checkState(wrappedValue.isEmpty() && currentValue.isEmpty());
    return true;
}
/**
 * Advances the line counter, strips a leading byte order mark from the first line and hands
 * the line to {@code processLine()}. Any runtime failure is rethrown with the file, line
 * number and line content added to the message, keeping the original exception as the cause.
 */
void processLineWithCheck(String line) {
    lineNumber++;
    boolean startsWithBom = lineNumber == 1 && line.startsWith("\uFEFF");
    String content = startsWithBom ? line.substring(1) : line;
    try {
        processLine(content);
    } catch (RuntimeException e) {
        String message =
            String.format("[%s:%s] %s (%s)", path, lineNumber, e.getMessage(), content);
        throw new RuntimeException(message, e);
    }
}
/**
 * Processes a single (already trimmed) line: classifies it, verifies that its type is allowed
 * to follow the previously processed line's type, and updates the parser state accordingly.
 *
 * <p>Lines that are empty once any end-of-line comment has been removed are ignored and do not
 * affect the recorded "last type".
 *
 * @param line the trimmed line, without any leading byte order mark.
 * @throws IllegalStateException if the line cannot be classified, is not permitted after the
 *     previous line, or is otherwise inconsistent with the current parser state.
 */
void processLine(String line) {
    line = maybeTrimEndOfLineComment(line);
    if (line.isEmpty()) {
        return;
    }
    LineMatch match = LineType.match(line, inBlockComment);
    checkState(match.getType().isValidTransitionFrom(lastType),
        "invalid state transition: %s --//-> %s", lastType, match.getType());
    switch (match.getType()) {
    case COMMENT:
        if (name != null) {
            // Comments in data are ignored since they cannot be properly associated with
            // paths or values in an IcuData instance (only legacy tooling emits these).
            break;
        }
        if (line.startsWith("/*")) {
            inBlockComment = true;
        }
        headerComment.add(match.get(0));
        if (inBlockComment && line.contains("*/")) {
            // A block comment may only be closed at the very end of a line.
            checkState(line.indexOf("*/") == line.length() - 2,
                "unexpected end of comment block");
            inBlockComment = false;
        }
        break;
    case INLINE_VALUE:
        // A complete "name{value}" on one line: emit it directly below the current path.
        icuData.put(
            getPathFromStack().extendBy(getSegment(match.get(0))),
            RbValue.of(unquote(match.get(1))));
        break;
    case GROUP_START:
        checkState(currentValue.isEmpty());
        if (name == null) {
            // The first group opened is the root element, whose name is kept off the stack.
            name = match.get(0);
            checkState(name != null, "cannot have anonymous top-level group");
        } else {
            pathStack.push(getSegment(match.get(0)));
        }
        wrappedValue = "";
        isLineContinuation = false;
        break;
    case QUOTED_VALUE:
        // Quoted lines without a trailing ',' are joined with the following quoted line(s).
        wrappedValue += unquote(match.get(0));
        isLineContinuation = !line.endsWith(",");
        if (!isLineContinuation) {
            currentValue.add(wrappedValue);
            wrappedValue = "";
        }
        break;
    case VALUE:
        checkState(!isLineContinuation, "unexpected unquoted value");
        currentValue.add(match.get(0));
        break;
    case GROUP_END:
        // Account for quoted values without trailing ',' just before group end.
        if (isLineContinuation) {
            currentValue.add(wrappedValue);
            // Clear the flushed text so it is not mistaken for an unfinished value by the
            // end-of-parse sanity checks when no further group is opened after this one.
            wrappedValue = "";
            isLineContinuation = false;
        }
        // Emit the collection sequence of values for the current path as an RbValue.
        if (!currentValue.isEmpty()) {
            icuData.put(getPathFromStack(), RbValue.of(currentValue));
            currentValue.clear();
        }
        // Annoyingly the name is outside the stack so the stack will empty before the last
        // end group.
        if (!pathStack.isEmpty()) {
            pathStack.pop();
            // Restart anonymous segment numbering for the depth that was just left.
            indices.setCount(pathStack.size(), 0);
        } else {
            checkState(!inFinalGroup, "unexpected group end");
            inFinalGroup = true;
        }
        break;
    case UNKNOWN:
        throw new IllegalStateException("cannot parse line: " + match.get(0));
    }
    lastType = match.getType();
}
/**
 * Builds the resource bundle path for the current path stack, ordered from the outermost
 * segment to the innermost. If the innermost segment is an auto-generated anonymous name (of
 * the form {@code <NNNN>}) it is left out of the path.
 */
private RbPath getPathFromStack() {
    if (!pathStack.isEmpty()) {
        // The stack iterates from the most recently pushed (innermost) segment outwards.
        List<String> innermostFirst = new ArrayList<>();
        for (String segment : pathStack) {
            innermostFirst.add(segment);
        }
        String innermost = innermostFirst.get(0);
        if (innermost.matches("<[0-9]{4}>")) {
            innermostFirst.remove(0);
        }
        return RbPath.of(Lists.reverse(innermostFirst));
    }
    return RbPath.of();
}
/**
 * Returns the given segment name, or, if it is {@code null}, a generated name of the form
 * {@code <NNNN>} numbered by how many anonymous segments have been generated so far at the
 * current stack depth.
 */
private String getSegment(String segmentOrNull) {
    if (segmentOrNull == null) {
        int depth = pathStack.size();
        // Multiset.add(element, n) returns the count held before the addition.
        int previousCount = indices.add(depth, 1);
        return String.format("<%04d>", previousCount);
    }
    return segmentOrNull;
}
/**
 * Removes a trailing {@code //} comment (and the whitespace before it) from a data line.
 *
 * <p>Header lines (everything before the root element's name has been read) are returned
 * unchanged. In data lines, a {@code //} that appears inside a double-quoted string is not a
 * comment; quoted sections are tracked anywhere in the line and a backslash inside quotes
 * escapes the character after it, so an escaped quote does not end the string.
 *
 * @param line the trimmed line to examine.
 * @return the line without its end-of-line comment, or the line itself if it has none.
 */
private String maybeTrimEndOfLineComment(String line) {
    // Once the name is set, we are past the header and into the data.
    if (name == null) {
        return line;
    }
    boolean inQuotes = false;
    for (int i = 0; i < line.length(); i++) {
        char c = line.charAt(i);
        if (inQuotes) {
            if (c == '\\') {
                // Skip the escaped character so that \" does not close the quoted section.
                i++;
            } else if (c == '"') {
                inQuotes = false;
            }
        } else if (c == '"') {
            inQuotes = true;
        } else if (line.startsWith("//", i)) {
            return whitespace().trimTrailingFrom(line.substring(0, i));
        }
    }
    return line;
}
/**
 * Strips the surrounding double quotes from a quoted value and unescapes any backslash-escaped
 * double quote or backslash inside it. Values that are not wrapped in double quotes are
 * returned as-is, but must not contain a double quote at all.
 *
 * @throws IllegalStateException if an unquoted value contains a double quote.
 */
private static String unquote(String s) {
    boolean isQuoted = s.startsWith("\"") && s.endsWith("\"");
    if (!isQuoted) {
        checkState(!s.contains("\""), "invalid unquoted value: %s", s);
        return s;
    }
    String inner = s.substring(1, s.length() - 1);
    return inner.replaceAll("\\\\([\"\\\\])", "$1");
}
/**
 * The outcome of classifying one line: the line's type together with a zero-indexed accessor
 * for the pieces of text captured from it.
 */
private static final class LineMatch {
    private final LineType type;
    private final Function<Integer, String> args;

    LineMatch(LineType lineType, Function<Integer, String> capturedText) {
        type = checkNotNull(lineType);
        args = checkNotNull(capturedText);
    }

    /** Returns the type the line was classified as. */
    LineType getType() {
        return type;
    }

    /** Returns the n-th (zero-indexed) captured piece of text, as supplied by the accessor. */
    String get(int n) {
        return args.apply(n);
    }
}
/**
 * The kinds of line recognised by the parser. Each type carries the regular expression that a
 * whole (trimmed) line must match, and the types permitted to precede it are held in
 * {@code TRANSITIONS}.
 */
private enum LineType {
    // Comment _start_ with any comment value captured.
    COMMENT("(?://|/\\*)\\s*(.*)"),
    // A combination of GROUP_START, VALUE and GROUP_END with whitespace.
    INLINE_VALUE("(?:(.*\\S)\\s*)?\\{\\s*((?:\".*\")|(?:[^\"{}]*\\S))\\s*\\}"),
    // Allows for empty segment names (anonymous arrays) which match 'null'.
    GROUP_START("(?:(.*\\S)\\s*)?\\{"),
    GROUP_END("\\}"),
    QUOTED_VALUE("(\".*\"),?"),
    // The capture is reluctant so that a trailing ',' is consumed as the separator instead of
    // being swallowed into the captured value (the character class itself permits ',').
    VALUE("([^\"{}]+?),?"),
    UNKNOWN(".*");

    // Table of allowed transitions expected during parsing.
    // key=current state, values=set of permitted previous states
    private static final ImmutableSetMultimap<LineType, LineType> TRANSITIONS =
        ImmutableSetMultimap.<LineType, LineType>builder()
            .putAll(COMMENT, COMMENT)
            .putAll(INLINE_VALUE, COMMENT, INLINE_VALUE, GROUP_START, GROUP_END)
            .putAll(GROUP_START, COMMENT, GROUP_START, GROUP_END, INLINE_VALUE)
            .putAll(VALUE, GROUP_START, VALUE, QUOTED_VALUE)
            .putAll(QUOTED_VALUE, GROUP_START, VALUE, QUOTED_VALUE)
            .putAll(GROUP_END, GROUP_END, INLINE_VALUE, VALUE, QUOTED_VALUE)
            .build();

    private final Pattern pattern;

    LineType(String regex) {
        this.pattern = Pattern.compile(regex);
    }

    /** Returns whether a line of this type is permitted directly after one of {@code lastType}. */
    boolean isValidTransitionFrom(LineType lastType) {
        return TRANSITIONS.get(this).contains(lastType);
    }

    /**
     * Classifies a line.
     *
     * @param line the trimmed line to classify.
     * @param inBlockComment whether the line lies inside a block comment, in which case it is
     *     always classified as a comment.
     * @return the match, giving the line's type and zero-indexed access to its captured text;
     *     the type is {@code UNKNOWN} (with the whole line as its only text) if nothing else
     *     matched.
     */
    static LineMatch match(String line, boolean inBlockComment) {
        // Block comments kinda suck and it'd be great if the ICU data only used '//' style
        // comments (it would definitely simplify any parsers out there). Once the
        // transition to the new transformation tools is complete, they can be changed to
        // only emit '//' style comments.
        if (inBlockComment) {
            // Drop the decorative leading '*' of a block comment continuation line.
            if (line.startsWith("*")) {
                line = whitespace().trimLeadingFrom(line.substring(1));
            }
            return new LineMatch(COMMENT, ImmutableList.of(line)::get);
        }
        // Only types with an entry in the transition table are tried (UNKNOWN has none and is
        // the fallback); the first type whose pattern matches the whole line wins.
        for (LineType type : TRANSITIONS.keySet()) {
            // Regex groups start at 1, but we want the getter function to be zero-indexed.
            Matcher m = type.pattern.matcher(line);
            if (m.matches()) {
                return new LineMatch(type, n -> {
                    checkElementIndex(n, m.groupCount());
                    return m.group(n + 1);
                });
            }
        }
        return new LineMatch(UNKNOWN, ImmutableList.of(line)::get);
    }
}
}
}