// blob: b6843134b57dc9ba8720a54cdb7734bbe1be7a9a [file] [log] [blame]
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Objects;
import java.util.function.Function;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.collect.Comparators;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
/**
 * A resource bundle path, used to identify entries in ICU data.
 *
 * <p>A path is an ordered sequence of non-empty segments. Every segment is validated when the
 * path is created and must be one of:
 * <ul>
 *   <li>a hidden label of the form {@code <...>}, whose content is not validated;
 *   <li>a quoted segment of the form {@code "..."}, whose content may contain whitespace but no
 *       further double quotes;
 *   <li>an unquoted segment, which may contain neither whitespace nor double quotes.
 * </ul>
 *
 * <p>Immutable and thread safe.
 */
public final class RbPath implements Comparable<RbPath> {
    private static final Splitter PATH_SPLITTER = Splitter.on('/').trimResults();

    // This defines ordering of paths in IcuData instances and thus the order in ICU data files.
    // If there's ever a reason to have a different "natural" order for paths, this Comparator
    // should be moved into the ICU file writer class(es).
    private static final Comparator<RbPath> ORDERING =
        Comparator.comparing(
            p -> p.segments,
            Comparators.lexicographical(Comparator.<String>naturalOrder()));

    // Matches the definition of invariant characters in "uinvchar.cpp". We can make this all much
    // faster if needed with a custom matcher (it's just a 128 way bit lookup via 2 longs).
    private static final CharMatcher INVARIANT_CHARS =
        CharMatcher.ascii().and(CharMatcher.anyOf("!#$@[\\]^`{|}~").negate());

    // Note that we must also prohibit double-quote from appearing anywhere other than surrounding
    // segment values. This is because some segment values can contain special ICU data characters
    // (e.g. ':') but must be treated as literals. There is no proper "escaping" mechanism in ICU
    // data for key values (since '\' is not an invariant, things like \\uxxxx are not possible).
    //
    // Ideally quoting would be done when the file is written, but that would require additional
    // complexity in RbPath, since suffixes like ":intvector" must not be quoted and must somehow
    // be distinguished from timezone "metazone" names which also contain ':'.
    private static final CharMatcher QUOTED_SEGMENT_CHARS =
        INVARIANT_CHARS
            .and(CharMatcher.javaIsoControl().negate())
            .and(CharMatcher.isNot('"'));

    private static final CharMatcher UNQUOTED_SEGMENT_CHARS =
        QUOTED_SEGMENT_CHARS.and(whitespace().negate());

    /**
     * Returns a path with the specified (possibly empty) sequence of segments. Note that unlike
     * {@link #parse(String)}, {@code '/'} is not treated specially and can be present in a path
     * element constructed by this method.
     *
     * @param segments the path segments, in order.
     * @return the path made up of the given segments.
     * @throws IllegalArgumentException if any segment is empty or otherwise invalid.
     */
    public static RbPath of(String... segments) {
        return of(Arrays.asList(segments));
    }

    /**
     * Returns a path with the specified (possibly empty) sequence of segments. Note that unlike
     * {@link #parse(String)}, {@code '/'} is not treated specially and can be present in a path
     * element constructed by this method.
     *
     * @param segments the path segments, in order.
     * @return the path made up of the given segments.
     * @throws IllegalArgumentException if any segment is empty or otherwise invalid.
     */
    public static RbPath of(Iterable<String> segments) {
        return new RbPath(segments);
    }

    /**
     * Parses the given path string, assuming {@code '/'} as a path separator. A single leading
     * {@code '/'} is permitted and ignored, and whitespace around each segment is trimmed.
     *
     * @param path the non-empty path string to parse.
     * @return the parsed path.
     * @throws IllegalArgumentException if the string is empty, or if any resulting segment is
     *     empty or otherwise invalid.
     */
    public static RbPath parse(String path) {
        checkArgument(!path.isEmpty(), "cannot parse an empty path string");
        // Allow leading '/', but don't allow empty segments anywhere else.
        if (path.startsWith("/")) {
            path = path.substring(1);
        }
        return new RbPath(PATH_SPLITTER.split(path));
    }

    /**
     * Returns the common prefix length of two paths (useful when thinking of path hierarchies).
     *
     * @param lhs the first path.
     * @param rhs the second path.
     * @return the number of leading segments which are equal in both paths.
     */
    public static int getCommonPrefixLength(RbPath lhs, RbPath rhs) {
        int maxLength = Math.min(lhs.length(), rhs.length());
        int n = 0;
        while (n < maxLength && lhs.getSegment(n).equals(rhs.getSegment(n))) {
            n++;
        }
        return n;
    }

    private final ImmutableList<String> segments;
    private final int hashCode;

    private RbPath(Iterable<String> segments) {
        this.segments = ImmutableList.copyOf(segments);
        // Use "this.segments" since the incoming list can have a different hash!
        this.hashCode = Objects.hash(this.segments);
        for (String segment : this.segments) {
            checkArgument(
                !segment.isEmpty(), "path segments must not be empty: %s", this.segments);
            checkSegment(segment);
        }
    }

    // Validates a single non-empty segment. Either the label is quoted (e.g. "foo") or it is
    // bare (e.g. foo), but it can only contain double quotes at either end, or not at all. If
    // the segment is quoted, only the content is validated, and not the quotes themselves.
    private static void checkSegment(String segment) {
        switch (segment.charAt(0)) {
        case '<':
            // Allow anything in hidden labels, since they will be removed later and never
            // appear in the final ICU data.
            checkArgument(segment.endsWith(">"),
                "mismatched quoting for hidden label: %s", segment);
            return;

        case '"':
            // The length check rejects a lone '"', which would otherwise satisfy endsWith()
            // and then make substring() below throw StringIndexOutOfBoundsException.
            checkArgument(segment.length() >= 2 && segment.endsWith("\""),
                "mismatched quoting for segment: %s", segment);
            checkArgument(
                QUOTED_SEGMENT_CHARS.matchesAllOf(segment.substring(1, segment.length() - 1)),
                "invalid character in quoted resource bundle path segment: %s", segment);
            return;

        default:
            checkArgument(
                UNQUOTED_SEGMENT_CHARS.matchesAllOf(segment),
                "invalid character in unquoted resource bundle path segment: %s", segment);
            return;
        }
    }

    /** Returns the number of segments in this path. */
    public int length() {
        return segments.size();
    }

    /**
     * Returns the Nth segment in this path.
     *
     * @param n the zero-based segment index.
     * @throws IndexOutOfBoundsException if {@code n} is not a valid segment index.
     */
    public String getSegment(int n) {
        return segments.get(n);
    }

    /**
     * Returns a new path extended at the end by the specified segments.
     *
     * @throws IllegalArgumentException if any of the new segments is empty or otherwise invalid.
     */
    public RbPath extendBy(String... parts) {
        return new RbPath(Iterables.concat(segments, Arrays.asList(parts)));
    }

    /** Returns whether this path starts with the specified prefix. */
    public boolean startsWith(RbPath prefix) {
        return prefix.length() <= length() && matchesSublist(prefix, 0);
    }

    /** Returns whether this path ends with the specified suffix. */
    public boolean endsWith(RbPath suffix) {
        return suffix.length() <= length()
            && matchesSublist(suffix, length() - suffix.length());
    }

    /**
     * Returns whether this path contains the specified path as a contiguous run of segments. A
     * path longer than this one is never contained.
     */
    public boolean contains(RbPath path) {
        int maxOffset = length() - path.length();
        for (int i = 0; i <= maxOffset; i++) {
            if (matchesSublist(path, i)) {
                return true;
            }
        }
        return false;
    }

    // Assume length check has been done (i.e. offset + path.length() <= length()).
    private boolean matchesSublist(RbPath path, int offset) {
        for (int i = 0; i < path.length(); i++) {
            if (!path.getSegment(i).equals(getSegment(i + offset))) {
                return false;
            }
        }
        return true;
    }

    // TODO: Remove this and isAnonymous() since they are only called once each, in the same place.
    /**
     * Returns the path consisting of all but the last segment of this path.
     *
     * @throws IllegalStateException if this is the empty path.
     */
    public RbPath getParent() {
        checkState(length() > 0, "cannot get parent of the empty path");
        return new RbPath(segments.subList(0, length() - 1));
    }

    /**
     * Returns whether the last segment of this path starts with {@code '<'} (i.e. is a hidden
     * label). The empty path is not anonymous.
     */
    public boolean isAnonymous() {
        return length() > 0 && segments.get(length() - 1).charAt(0) == '<';
    }

    // TODO: Remove this special case code (called exactly once).
    /**
     * Returns a new path in which each segment has been replaced by the result of applying the
     * given function to it.
     *
     * @throws IllegalArgumentException if any mapped segment is empty or otherwise invalid.
     */
    public RbPath mapSegments(Function<? super String, String> fn) {
        return new RbPath(segments.stream().map(fn).collect(toImmutableList()));
    }

    // TODO: Remove this in favour of having properly typed paths.
    // Returns whether the last segment ends with ":int" or ":intvector". The empty path has no
    // last segment and is reported as "not an int path" (mirroring isAnonymous()) rather than
    // failing with an index error.
    boolean isIntPath() {
        if (segments.isEmpty()) {
            return false;
        }
        String lastElement = segments.get(segments.size() - 1);
        return lastElement.endsWith(":int") || lastElement.endsWith(":intvector");
    }

    /** Orders paths lexicographically by segment, using the natural ordering of strings. */
    @Override public int compareTo(RbPath other) {
        return ORDERING.compare(this, other);
    }

    @Override public boolean equals(Object other) {
        return (other instanceof RbPath) && segments.equals(((RbPath) other).segments);
    }

    @Override public int hashCode() {
        return hashCode;
    }

    /** Returns the segments of this path joined by {@code '/'} (with no leading separator). */
    @Override public String toString() {
        return String.join("/", segments);
    }
}