// blob: 93f23524c905c596a272f81f1358373a039337a4 [file] [log] [blame]
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.CREATE_NEW;
import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
import static java.util.stream.Collectors.joining;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.collect.Iterables;
/**
* Writes an IcuData object to a text file. A lot of this class was copied directly from the
* original {@code IcuTextWriter} in the CLDR project and has a number of very idiosyncratic
* behaviours. The behaviour of this class is currently tuned to produce perfect parity with
* the original conversion tools, but once migration of the tools is complete, it should
* probably be revisited and tidied up.
*/
// TODO: Link to a definitive specification for the ICU data files and remove the hacks!
final class IcuTextWriter {
private static final String INDENT = " ";
// List of characters to escape in UnicodeSets
// ('\' followed by any of '\', '[', ']', '{', '}', '-', '&', ':', '^', '=').
private static final Pattern UNICODESET_ESCAPE =
Pattern.compile("\\\\[\\\\\\[\\]{}\\-&:^=]");
// Only escape \ and " from other strings.
private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\"");
private static final OpenOption[] ONLY_NEW_FILES = { CREATE_NEW };
private static final OpenOption[] OVERWRITE_FILES = { CREATE, TRUNCATE_EXISTING };
/** Write a file in ICU data format with the specified header. */
static void writeToFile(
IcuData icuData, Path outDir, List<String> header, boolean allowOverwrite) {
try {
Files.createDirectories(outDir);
Path file = outDir.resolve(icuData.getName() + ".txt");
OpenOption[] fileOptions = allowOverwrite ? OVERWRITE_FILES : ONLY_NEW_FILES;
try (Writer w = Files.newBufferedWriter(file, UTF_8, fileOptions);
PrintWriter out = new PrintWriter(w)) {
new IcuTextWriter(icuData).writeTo(out, header);
}
} catch (IOException e) {
throw new RuntimeException("cannot write ICU data file: " + icuData.getName(), e);
}
}
private final IcuData icuData;
private int depth = 0;
private boolean valueWasInline = false;
IcuTextWriter(IcuData icuData) {
this.icuData = checkNotNull(icuData);
}
// TODO: Write a UTF-8 header (see https://unicode-org.atlassian.net/browse/ICU-10197).
private void writeTo(PrintWriter out, List<String> header) {
out.write('\uFEFF');
writeHeaderAndComments(out, header, icuData.getFileComment());
// Write the ICU data to file. This takes the form:
// ----
// <name>{
// foo{
// bar{baz}
// }
// }
// ----
// So it's like every RbPath has an implicit prefix of the IcuData name.
String root = icuData.getName();
if (!icuData.hasFallback()) {
root += ":table(nofallback)";
}
// TODO: Replace with "open(root, out)" once happy with differences (it adds a blank line).
out.print(root);
out.print("{");
depth++;
RbPath lastPath = RbPath.of();
for (RbPath path : icuData.getPaths()) {
// Close any blocks up to the common path length. Since paths are all distinct, the
// common length should always be shorter than either path. We add 1 since we must also
// account for the implicit root segment.
int commonDepth = RbPath.getCommonPrefixLength(lastPath, path) + 1;
// Before closing, the "cursor" is at the end of the last value written.
closeLastPath(commonDepth, out);
// After opening the value will be ready for the next value to be written.
openNextPath(path, out);
valueWasInline = appendValues(icuData.getName(), path, icuData.get(path), out);
lastPath = path;
}
closeLastPath(0, out);
out.println();
out.close();
}
// Before: Cursor is at the end of the previous line.
// After: Cursor is positioned immediately after the last closed '}'
private void closeLastPath(int minDepth, PrintWriter out) {
if (valueWasInline) {
depth--;
out.print('}');
valueWasInline = false;
}
while (depth > minDepth) {
close(out);
}
}
// Before: Cursor is at the end of the previous line.
// After: Cursor is positioned immediately after the newly opened '{'
private void openNextPath(RbPath path, PrintWriter out) {
while (depth <= path.length()) {
// The -1 is to adjust for the implicit root element which means indentation (depth)
// no longer matches the index of the segment we are writing.
open(path.getSegment(depth - 1), out);
}
}
private void open(String label, PrintWriter out) {
newLineAndIndent(out, FormatOptions.PATH_FORMAT);
depth++;
// This handles the "magic" pseudo indexing paths that are added by RegexTransformer.
// These take the form of "<any-string>" and are used to ensure that path order can be
// well defined even for anonymous lists of items.
if (!label.startsWith("<") && !label.endsWith(">")) {
out.print(label);
}
out.print('{');
}
private void close(PrintWriter out) {
depth--;
newLineAndIndent(out, FormatOptions.PATH_FORMAT);
out.print('}');
}
private void newLineAndIndent(PrintWriter out, FormatOptions format) {
out.println();
if (format.shouldIndent) {
for (int i = 0; i < depth; i++) {
out.print(INDENT);
}
}
}
// Currently the "header" uses '//' line comments but the comments are in a block.
// TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
private static void writeHeaderAndComments(
PrintWriter out, List<String> header, List<String> comments) {
header.forEach(s -> out.println("// " + s));
if (!comments.isEmpty()) {
// TODO: Don't use /* */ block quotes, just use inline // quotes.
out.println(
comments.stream().collect(joining("\n * ", "/**\n * ", "\n */")));
}
}
private static final class FormatOptions {
// Only the indent flag is used
final static FormatOptions PATH_FORMAT = new FormatOptions(true, true, true);
static FormatOptions forPath(RbPath rbPath) {
return new FormatOptions(
!rbPath.isIntPath() && !rbPath.isBinPath(),
!rbPath.endsWith(RB_SEQUENCE) && !rbPath.isBinPath(),
!rbPath.isBinPath());
}
final boolean shouldQuote;
final boolean shouldUseComma;
final boolean shouldIndent;
private FormatOptions(boolean shouldQuote, boolean shouldUseComma, boolean shouldIndent) {
this.shouldQuote = shouldQuote;
this.shouldUseComma = shouldUseComma;
this.shouldIndent = shouldIndent;
}
}
/** Inserts padding and values between braces. */
// TODO: Get rid of the need for icuDataName by adding type information to RbPath.
private boolean appendValues(
String icuDataName, RbPath rbPath, List<RbValue> values, PrintWriter out) {
RbValue onlyValue;
boolean wasSingular = false;
FormatOptions format = FormatOptions.forPath(rbPath);
if (values.size() == 1 && !mustBeArray(true, icuDataName, rbPath)) {
onlyValue = values.get(0);
if (onlyValue.isSingleton() && !mustBeArray(false, icuDataName, rbPath)) {
// Value has a single element and is not being forced to be an array.
String onlyElement = Iterables.getOnlyElement(onlyValue.getElements());
if (format.shouldQuote) {
onlyElement = quoteInside(onlyElement);
}
// The numbers below are simply tuned to match the line wrapping in the original
// CLDR code. The behaviour it produces is sometimes strange (wrapping a line just
// for a single character) and could definitely be improved.
// TODO: Simplify this and add hysteresis to ensure less "jarring" line wrapping.
int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length());
if (onlyElement.length() <= maxWidth) {
// Single element for path: don't add newlines.
printValue(out, onlyElement, format);
wasSingular = true;
} else {
// Element too long to fit in one line, so wrap.
int end;
for (int i = 0; i < onlyElement.length(); i = end) {
end = goodBreak(onlyElement, i + maxWidth);
String part = onlyElement.substring(i, end);
newLineAndIndent(out, format);
printValue(out, part, format);
}
}
} else {
// Only one array for the rbPath, so don't add an extra set of braces.
printElements(out, onlyValue, format);
}
} else {
for (RbValue value : values) {
if (value.isSingleton()) {
// Single-value array: print normally.
printElements(out, value, format);
} else {
// Enclose this array in braces to separate it from other values.
open("", out);
printElements(out, value, format);
close(out);
}
}
}
return wasSingular;
}
private static final RbPath RB_SEQUENCE = RbPath.of("Sequence");
private static final RbPath RB_RULES = RbPath.of("rules");
private static final RbPath RB_LOCALE_SCRIPT = RbPath.of("LocaleScript");
private static final RbPath RB_ERAS = RbPath.of("eras");
private static final RbPath RB_NAMED = RbPath.of("named");
private static final RbPath RB_CALENDAR_PREFERENCE_DATA = RbPath.of("calendarPreferenceData");
private static final RbPath RB_METAZONE_INFO = RbPath.of("metazoneInfo");
/**
* Wrapper for a hack to determine if the given rb path should always present its values as an
* array.
*/
// TODO: Verify this is still needed, and either make it less hacky, or delete it.
private static boolean mustBeArray(boolean topValues, String name, RbPath rbPath) {
if (topValues) {
// matches "rules/setNN" (hence the mucking about with raw segments).
return name.equals("pluralRanges")
&& rbPath.startsWith(RB_RULES)
&& rbPath.getSegment(1).startsWith("set");
}
return rbPath.equals(RB_LOCALE_SCRIPT)
|| (rbPath.contains(RB_ERAS)
&& !rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")
&& !rbPath.endsWith(RB_NAMED))
|| rbPath.startsWith(RB_CALENDAR_PREFERENCE_DATA)
|| rbPath.startsWith(RB_METAZONE_INFO);
}
private void printElements(PrintWriter out, RbValue rbValue, FormatOptions format) {
// TODO: If "shouldUseComma" is made obsolete, just use the "else" block always.
if (rbValue.getElementsPerLine() == 1) {
for (String v : rbValue.getElements()) {
newLineAndIndent(out, format);
printValue(out, quoteInside(v), format);
if (format.shouldUseComma) {
out.print(",");
}
}
} else {
checkArgument(format.shouldUseComma, "cannot group non-sequence values");
Iterable<List<String>> partitions =
Iterables.partition(rbValue.getElements(), rbValue.getElementsPerLine());
for (List<String> tuple : partitions) {
newLineAndIndent(out, format);
for (String v : tuple) {
printValue(out, quoteInside(v), format);
out.print(",");
}
}
}
}
private static void printValue(PrintWriter out, String value, FormatOptions format) {
if (format.shouldQuote) {
out.append('"').append(value).append('"');
} else {
out.append(value);
}
}
// Can a string be broken here? If not, backup until we can.
// TODO: Either don't bother line wrapping or look at making this use a line-break iterator.
private static int goodBreak(String quoted, int end) {
if (end > quoted.length()) {
return quoted.length();
}
// Don't break escaped Unicode characters.
// Need to handle both e.g. \u4E00 and \U00020000
for (int i = end - 1; i > end - 10;) {
char current = quoted.charAt(i--);
if (!Character.toString(current).matches("[0-9A-Fa-f]")) {
if ((current == 'u' || current == 'U') && i > end - 10
&& quoted.charAt(i) == '\\') {
return i;
}
break;
}
}
while (end > 0) {
char ch = quoted.charAt(end - 1);
if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
break;
}
--end;
}
return end;
}
// Fix characters inside strings.
private static String quoteInside(String item) {
// Unicode-escape all quotes.
item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
// Double up on backslashes, ignoring Unicode-escaped characters.
Pattern pattern =
item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
Matcher matcher = pattern.matcher(item);
if (!matcher.find()) {
return item;
}
StringBuilder buffer = new StringBuilder();
int start = 0;
do {
buffer.append(item, start, matcher.start());
int punctuationChar = item.codePointAt(matcher.end() - 1);
buffer.append("\\");
if (punctuationChar == '\\') {
buffer.append('\\');
}
buffer.append(matcher.group());
start = matcher.end();
} while (matcher.find());
buffer.append(item.substring(start));
return buffer.toString();
}
}