blob: 9d076b2cc74ca0cf3da10556e9379c886846262d [file] [log] [blame]
/* See LICENSE file for copyright and license details. */
#include <ctype.h>
#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "util.h"
/*
 * Inclusive codepoint range as filled in by range_parse(). A single
 * codepoint "XXXXXX" is stored with lower == upper.
 */
struct range {
/* first codepoint of the range */
uint_least32_t lower;
/* last codepoint of the range (inclusive) */
uint_least32_t upper;
};
/*
 * State handed to properties_callback() through the opaque payload
 * pointer of parse_file_with_callback().
 */
struct properties_payload {
/* per-codepoint property array, indexed by codepoint (0x110000 entries) */
struct properties *prop;
/* property specifications, matched by file name and UCD name */
const struct property_spec *spec;
/* number of entries in spec */
uint_least8_t speclen;
/*
 * invoked by properties_callback() for every codepoint of a matching
 * range, with the index of the matching spec entry as the value; a
 * non-zero return value makes properties_callback() terminate the program
 */
int (*set_value)(struct properties_payload *, uint_least32_t,
int_least64_t);
/*
 * may be NULL; set_value_bp() calls it with the codepoint, the property
 * already stored for it and the new property, and stores the result
 */
uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t,
uint_least8_t);
};
/*
 * State handed to break_test_callback(): pointers to the caller's test
 * array and its length, both of which the callback updates as it appends
 * one test per parsed line.
 */
struct break_test_payload {
/* address of the (reallocated) test array */
struct break_test **test;
/* address of the number of tests in *test */
size_t *testlen;
};
/*
 * Overflow-checked realloc() for an array of len elements of size bytes
 * each. If len * size cannot be represented in a size_t, errno is set to
 * ENOMEM and NULL is returned without touching p; otherwise the result of
 * realloc(p, len * size) is returned.
 */
static void *
reallocate_array(void *p, size_t len, size_t size)
{
	/* the product fits iff len is zero or size <= SIZE_MAX / len */
	if (len == 0 || size <= SIZE_MAX / len) {
		return realloc(p, len * size);
	}

	errno = ENOMEM;
	return NULL;
}
/*
 * Parse the first len bytes of str as a hexadecimal codepoint.
 *
 * str need not be NUL-terminated at str[len]; upper- and lowercase digits
 * are accepted. On success the value is stored in *cp and 0 is returned.
 * On failure (empty input, more than six digits, a non-hexadecimal
 * character, or a value above 0x10FFFF) a diagnostic is printed to stderr,
 * *cp is left untouched and 1 is returned.
 */
int
hextocp(const char *str, size_t len, uint_least32_t *cp)
{
	uint_least32_t value = 0;
	size_t i;
	int digit;

	/* an empty token is not a number; do not silently read it as 0 */
	if (len == 0) {
		fprintf(stderr, "hextocp: Empty string is not hexadecimal.\n");
		return 1;
	}

	/* the maximum valid codepoint is 0x10FFFF */
	if (len > 6) {
		fprintf(stderr, "hextocp: '%.*s' is too long.\n", (int)len,
		        str);
		return 1;
	}

	for (i = 0; i < len; i++) {
		if (str[i] >= '0' && str[i] <= '9') {
			digit = str[i] - '0';
		} else if (str[i] >= 'a' && str[i] <= 'f') {
			digit = str[i] - 'a' + 10;
		} else if (str[i] >= 'A' && str[i] <= 'F') {
			digit = str[i] - 'A' + 10;
		} else {
			fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n",
			        (int)len, str);
			return 1;
		}

		/* at most six digits, so the accumulator cannot overflow */
		value = (value << 4) | (uint_least32_t)digit;
	}

	if (value > UINT32_C(0x10FFFF)) {
		fprintf(stderr, "hextocp: '%.*s' is too large.\n", (int)len,
		        str);
		return 1;
	}

	*cp = value;

	return 0;
}
/*
 * Parse a list of hexadecimal codepoints separated by single spaces
 * (e.g. "0053 0073") into a freshly allocated array.
 *
 * On success 0 is returned, *cp points to the array (to be released by the
 * caller with free()) and *cplen holds its length; an empty string yields
 * *cp == NULL and *cplen == 0. If any token is rejected by hextocp(), the
 * array is released again, *cp is set to NULL, *cplen to 0 and 1 is
 * returned. The program is terminated if the allocation fails.
 */
int
parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
{
	size_t count, i, toklen;
	const char *tok, *sep;

	if (str[0] == '\0') {
		*cp = NULL;
		*cplen = 0;
		return 0;
	}

	/* count the number of spaces in the string and infer list length */
	count = 1;
	for (tok = str; (sep = strchr(tok, ' ')) != NULL; tok = sep + 1) {
		count++;
	}

	/* allocate resources */
	if (!(*cp = calloc(count, sizeof(**cp)))) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}
	*cplen = count;

	/* go through the string again, parsing the numbers */
	for (i = 0, tok = str; i < count; i++) {
		sep = strchr(tok, ' ');
		toklen = (sep != NULL) ? (size_t)(sep - tok) : strlen(tok);

		if (hextocp(tok, toklen, &((*cp)[i]))) {
			/* do not leak or expose a half-filled array */
			free(*cp);
			*cp = NULL;
			*cplen = 0;
			return 1;
		}

		if (sep != NULL) {
			tok = sep + 1;
		}
	}

	return 0;
}
/*
 * Parse str, which is either a single codepoint "XXXXXX" or a range
 * "XXXXXX..XXXXXX", into range. A single codepoint yields
 * lower == upper. Returns 0 on success and 1 if hextocp() rejects one of
 * the numbers.
 */
static int
range_parse(const char *str, struct range *range)
{
	const char *dots = strstr(str, "..");

	if (dots != NULL) {
		/* "XXXXXX..XXXXXX": lower bound before, upper after ".." */
		if (hextocp(str, (size_t)(dots - str), &range->lower)) {
			return 1;
		}
		if (hextocp(dots + 2, strlen(dots + 2), &range->upper)) {
			return 1;
		}
		return 0;
	}

	/* "XXXXXX": a range consisting of one codepoint */
	if (hextocp(str, strlen(str), &range->lower)) {
		return 1;
	}
	range->upper = range->lower;

	return 0;
}
/*
 * Read the file fname line by line, split each line into fields and hand
 * them to callback.
 *
 * Empty lines and lines starting with '#' are skipped. On every other
 * line, fields are separated by ';' and a '#' starts a trailing comment.
 * Spaces around each field and at the start of the comment are stripped.
 * The callback receives fname, the field array, the number of fields, the
 * comment (NULL if the line has none) and payload; the strings point into
 * a line buffer that is reused for the next line.
 *
 * The program is terminated with a diagnostic on stderr if the file cannot
 * be opened or read, if memory runs out, or if callback returns non-zero.
 */
void
parse_file_with_callback(const char *fname,
                         int (*callback)(const char *, char **, size_t, char *,
                                         void *),
                         void *payload)
{
	FILE *fp;
	char *line = NULL, **field = NULL, *comment;
	size_t linebufsize = 0, fieldbufsize = 0, nfields, start, end, i;
	ssize_t len;

	/* open file */
	if (!(fp = fopen(fname, "r"))) {
		fprintf(stderr, "parse_file_with_callback: fopen '%s': %s.\n",
		        fname, strerror(errno));
		exit(1);
	}

	while ((len = getline(&line, &linebufsize, fp)) >= 0) {
		/* remove trailing newline */
		if (len > 0 && line[len - 1] == '\n') {
			line[len - 1] = '\0';
			len--;
		}

		/* skip empty lines and comment lines */
		if (len == 0 || line[0] == '#') {
			continue;
		}

		/* tokenize line into fields */
		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
			/* skip leading whitespace */
			while (line[i] == ' ') {
				i++;
			}

			/* nothing before this index belongs to the field */
			start = i;

			/* check if we crashed into the comment */
			if (line[i] != '#') {
				/* extend field buffer, if necessary */
				if (++nfields > fieldbufsize) {
					if ((field = realloc(
						     field,
						     nfields *
							     sizeof(*field))) ==
					    NULL) {
						fprintf(stderr,
						        "parse_file_with_"
						        "callback: realloc: "
						        "%s.\n",
						        strerror(errno));
						exit(1);
					}
					fieldbufsize = nfields;
				}

				/* set current position as field start */
				field[nfields - 1] = &line[i];

				/* continue until we reach ';' or '#' or end */
				while (line[i] != ';' && line[i] != '#' &&
				       line[i] != '\0') {
					i++;
				}
			}

			if (line[i] == '#') {
				/* set comment-variable for later */
				comment = &line[i + 1];
			}

			/*
			 * strip trailing whitespace and terminate the field;
			 * the scan is bounded by the field start so that an
			 * empty field is terminated in place and we never
			 * step in front of the line buffer
			 */
			for (end = i; end > start && line[end - 1] == ' ';
			     end--) {
				;
			}
			line[end] = '\0';

			/* if comment is set, we are done */
			if (comment != NULL) {
				break;
			}
		}

		/* skip leading whitespace in comment */
		while (comment != NULL && comment[0] == ' ') {
			comment++;
		}

		/* call callback function */
		if (callback(fname, field, nfields, comment, payload)) {
			fprintf(stderr, "parse_file_with_callback: "
			                "Malformed input.\n");
			exit(1);
		}
	}

	/* getline() returns -1 on both EOF and error; tell them apart */
	if (ferror(fp)) {
		fprintf(stderr, "parse_file_with_callback: getline '%s': %s.\n",
		        fname, strerror(errno));
		exit(1);
	}

	fclose(fp);
	free(line);
	free(field);
}
/*
 * Line callback for parse_file_with_callback() that applies property
 * data files to the property array in the payload.
 *
 * The first spec entry that belongs to this file and whose UCD name equals
 * the second field, or is the first word of the comment followed by a
 * space, selects the property. The range in the first field is parsed and
 * set_value is invoked for each of its codepoints with the index of that
 * spec entry. Lines matching no entry are ignored.
 *
 * Returns 1 if the line has fewer than two fields or the range is
 * malformed, 0 otherwise. Terminates the program if set_value fails.
 */
static int
properties_callback(const char *file, char **field, size_t nfields,
                    char *comment, void *payload)
{
	/* prop always has the length 0x110000 */
	struct properties_payload *pl = (struct properties_payload *)payload;
	struct range r;
	uint_least32_t cp;
	uint_least8_t idx;
	size_t namelen;

	if (nfields < 2) {
		return 1;
	}

	for (idx = 0; idx < pl->speclen; idx++) {
		/* the entry must refer to the file we are parsing */
		if (!pl->spec[idx].file || strcmp(pl->spec[idx].file, file)) {
			continue;
		}

		/* the identifier is in the second field or leads the comment */
		if (strcmp(pl->spec[idx].ucdname, field[1])) {
			if (comment == NULL) {
				continue;
			}
			namelen = strlen(pl->spec[idx].ucdname);
			if (strncmp(pl->spec[idx].ucdname, comment, namelen) ||
			    comment[namelen] != ' ') {
				continue;
			}
		}

		/* parse range in first field */
		if (range_parse(field[0], &r)) {
			return 1;
		}

		/* apply to all codepoints in the range */
		for (cp = r.lower; cp <= r.upper; cp++) {
			if (pl->set_value(payload, cp, idx)) {
				exit(1);
			}
		}

		/* only the first matching entry is applied */
		break;
	}

	return 0;
}
/*
 * Deduplicate the 0x110000 entries of prop.
 *
 * comp->data receives every distinct properties struct once, in order of
 * first appearance (compared bytewise), comp->datalen their number, and
 * comp->offset[cp] the index into comp->data for codepoint cp. Both arrays
 * are allocated here; the program is terminated if an allocation fails.
 */
void
properties_compress(const struct properties *prop,
                    struct properties_compressed *comp)
{
	uint_least32_t cp, slot;

	/* initialization */
	comp->offset = malloc((size_t)UINT32_C(0x110000) *
	                      sizeof(*(comp->offset)));
	if (comp->offset == NULL) {
		fprintf(stderr, "malloc: %s\n", strerror(errno));
		exit(1);
	}
	comp->data = NULL;
	comp->datalen = 0;

	for (cp = 0; cp < UINT32_C(0x110000); cp++) {
		/* linear search for an identical struct seen earlier */
		slot = 0;
		while (slot < comp->datalen &&
		       memcmp(&(prop[cp]), &(comp->data[slot]),
		              sizeof(*prop)) != 0) {
			slot++;
		}

		if (slot < comp->datalen) {
			/* found a match! */
			comp->offset[cp] = slot;
			continue;
		}

		/*
		 * no matching properties-struct exists yet: append the
		 * current one to data and point the offset-table at it
		 */
		comp->datalen++;
		comp->data = reallocate_array(comp->data, comp->datalen,
		                              sizeof(*(comp->data)));
		if (comp->data == NULL) {
			fprintf(stderr, "reallocate_array: %s\n",
			        strerror(errno));
			exit(1);
		}
		memcpy(&(comp->data[comp->datalen - 1]), &(prop[cp]),
		       sizeof(*prop));
		comp->offset[cp] = comp->datalen - 1;
	}
}
/*
 * Build a two-stage lookup table from comp->offset.
 *
 * comp->offset maps each of the 0x110000 codepoints to an offset into
 * comp->data. Instead of indexing it directly with a codepoint 0xAAAABB,
 * two arrays are generated such that
 *
 *     comp->offset[0xAAAABB] == minor[major[0xAAAA] + 0xBB]
 *
 * major has 0x1100 entries, one per block of 0x100 codepoints. minor has
 * variable length: a block whose 0x100 offsets already occur as a
 * contiguous run anywhere in minor is not stored again, major simply
 * points at the existing run.
 *
 * Both arrays are allocated here; the program is terminated if an
 * allocation fails. Returns the percentage of blocks that could reuse an
 * existing run.
 */
double
properties_get_major_minor(const struct properties_compressed *comp,
                           struct properties_major_minor *mm)
{
	size_t block, pos, reused = 0;

	/* initialize */
	mm->major = malloc((size_t)0x1100 * sizeof(*(mm->major)));
	if (mm->major == NULL) {
		fprintf(stderr, "malloc: %s\n", strerror(errno));
		exit(1);
	}
	mm->minor = NULL;
	mm->minorlen = 0;

	for (block = 0; block < (size_t)0x1100; block++) {
		/*
		 * look for the offsets of the codepoints
		 * (block << 8)..((block << 8) + 0xFF) at every position of
		 * minor that still has 0x100 entries after it
		 */
		pos = 0;
		while (pos + 0xFF < mm->minorlen &&
		       memcmp(&(comp->offset[block << 8]), &(mm->minor[pos]),
		              sizeof(*(comp->offset)) * 0x100) != 0) {
			pos++;
		}

		if (pos + 0xFF < mm->minorlen) {
			/* found an index */
			reused++;
			mm->major[block] = pos;
			continue;
		}

		/* add "new" sequence to minor and point to it in major */
		mm->minorlen += 0x100;
		mm->minor = reallocate_array(mm->minor, mm->minorlen,
		                             sizeof(*(mm->minor)));
		if (mm->minor == NULL) {
			fprintf(stderr, "reallocate_array: %s\n",
			        strerror(errno));
			exit(1);
		}
		memcpy(&(mm->minor[mm->minorlen - 0x100]),
		       &(comp->offset[block << 8]),
		       sizeof(*(mm->minor)) * 0x100);
		mm->major[block] = mm->minorlen - 0x100;
	}

	/* return compression ratio */
	return (double)reused / 0x1100 * 100;
}
/*
 * Print data as a static const C array called name to stdout, eight
 * values per line. The element type is the narrowest of uint_least8_t,
 * uint_least16_t, uint_least32_t and uint_least64_t that holds the largest
 * value in data.
 */
void
properties_print_lookup_table(const char *name, const size_t *data,
                              size_t datalen)
{
	const char *type, *sep;
	size_t idx, largest = 0;

	/* determine the largest value to pick the element type */
	for (idx = 0; idx < datalen; idx++) {
		if (largest < data[idx]) {
			largest = data[idx];
		}
	}

	if (largest <= UINT_LEAST8_MAX) {
		type = "uint_least8_t";
	} else if (largest <= UINT_LEAST16_MAX) {
		type = "uint_least16_t";
	} else if (largest <= UINT_LEAST32_MAX) {
		type = "uint_least32_t";
	} else {
		type = "uint_least64_t";
	}

	printf("static const %s %s[] = {\n\t", type, name);
	for (idx = 0; idx < datalen; idx++) {
		/* last value ends the list, every eighth value ends a row */
		if (idx + 1 == datalen) {
			sep = "\n";
		} else if ((idx + 1) % 8 != 0) {
			sep = ", ";
		} else {
			sep = ",\n\t";
		}
		printf("%zu", data[idx]);
		fputs(sep, stdout);
	}
	printf("};\n");
}
/*
 * Print a static const C array called name to stdout whose i-th element
 * is get_value(payload, offset[i]), eight values per line.
 *
 * The element type is the narrowest least-width integer type that holds
 * all values: a signed type if any value is negative, an unsigned type
 * otherwise. get_value is called more than once per element and is
 * expected to return the same value each time.
 */
void
properties_print_derived_lookup_table(
	char *name, size_t *offset, size_t offsetlen,
	int_least64_t (*get_value)(const struct properties *, size_t),
	const void *payload)
{
	const char *type;
	size_t i;
	int_least64_t value, minval, maxval;

	/*
	 * determine the value range; minimum and maximum are tracked
	 * independently, as a value (e.g. the first one) can be both
	 */
	minval = INT_LEAST64_MAX;
	maxval = INT_LEAST64_MIN;
	for (i = 0; i < offsetlen; i++) {
		value = get_value(payload, offset[i]);

		if (value > maxval) {
			maxval = value;
		}
		if (value < minval) {
			minval = value;
		}
	}

	if (minval < 0) {
		/* we need a signed type */
		type = (minval >= INT_LEAST8_MIN && maxval <= INT_LEAST8_MAX) ?
		               "int_least8_t" :
		       (minval >= INT_LEAST16_MIN &&
		        maxval <= INT_LEAST16_MAX) ?
		               "int_least16_t" :
		       (minval >= INT_LEAST32_MIN &&
		        maxval <= INT_LEAST32_MAX) ?
		               "int_least32_t" :
		               "int_least64_t";
	} else {
		/* we are fine with an unsigned type */
		type = (maxval <= UINT_LEAST8_MAX)  ? "uint_least8_t" :
		       (maxval <= UINT_LEAST16_MAX) ? "uint_least16_t" :
		       (maxval <= UINT_LEAST32_MAX) ? "uint_least32_t" :
		                                      "uint_least64_t";
	}

	printf("static const %s %s[] = {\n\t", type, name);
	for (i = 0; i < offsetlen; i++) {
		printf("%" PRIiLEAST64, get_value(payload, offset[i]));
		if (i + 1 == offsetlen) {
			printf("\n");
		} else if ((i + 1) % 8 != 0) {
			printf(", ");
		} else {
			printf(",\n\t");
		}
	}
	printf("};\n");
}
/*
 * Print the definition of "enum <enumname>" to stdout: one enumerator
 * "<enumprefix>_<spec[i].enumname>" per spec entry, in order, followed by
 * the counter "NUM_<enumprefix>S".
 */
static void
properties_print_enum(const struct property_spec *spec, size_t speclen,
                      const char *enumname, const char *enumprefix)
{
	size_t n = 0;

	printf("enum %s {\n", enumname);
	while (n < speclen) {
		printf("\t%s_%s,\n", enumprefix, spec[n].enumname);
		n++;
	}
	printf("\tNUM_%sS,\n};\n\n", enumprefix);
}
/*
 * set_value implementation for break properties: store value as the
 * property of codepoint cp.
 *
 * A codepoint whose property still equals payload->speclen has not been
 * assigned yet and simply takes the value. Otherwise the conflict is
 * passed to payload->handle_conflict, whose result is stored; if no
 * handler is set, a diagnostic is printed and 1 is returned. Returns 0 on
 * success.
 */
static int
set_value_bp(struct properties_payload *payload, uint_least32_t cp,
             int_least64_t value)
{
	if (payload->prop[cp].property == payload->speclen) {
		/* no property assigned so far */
		payload->prop[cp].property = value;
		return 0;
	}

	if (payload->handle_conflict == NULL) {
		fprintf(stderr,
		        "set_value_bp: "
		        "Unhandled character break property "
		        "overwrite for 0x%06X (%s <- %s).\n",
		        cp,
		        payload->spec[payload->prop[cp].property].enumname,
		        payload->spec[value].enumname);
		return 1;
	}

	/* let the handler decide between the old and the new property */
	value = payload->handle_conflict(
		cp, (uint_least8_t)payload->prop[cp].property,
		(uint_least8_t)value);
	payload->prop[cp].property = value;

	return 0;
}
/*
 * get_value implementation for break properties: return the property
 * member of the entry at index offset in prop.
 */
static int_least64_t
get_value_bp(const struct properties *prop, size_t offset)
{
	const struct properties *entry = &prop[offset];

	return entry->property;
}
/*
 * Generate the lookup tables for one break property and print them as C
 * source to stdout.
 *
 * Every file named in spec is parsed exactly once (entries with a NULL
 * file are ignored) and the property of each codepoint is set to the index
 * of the matching spec entry. Conflicting assignments are resolved by
 * handle_conflict; if it is NULL, a conflict terminates the program.
 * Codepoints without any assignment get fill_missing(cp), or 0 if
 * fill_missing is NULL. post_process, if not NULL, may then adjust the
 * complete table.
 *
 * The output consists of "enum <prefix>_property" with the enumerators
 * "<PREFIX>_PROP_<enumname>", and the arrays "<prefix>_major" and
 * "<prefix>_minor". The compression ratio is reported on stderr, prefixed
 * with argv0. All generated names must fit into 64-byte buffers; the
 * program is terminated otherwise, as well as on allocation failure.
 */
void
properties_generate_break_property(
	const struct property_spec *spec, uint_least8_t speclen,
	uint_least8_t (*fill_missing)(uint_least32_t),
	uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t,
                                         uint_least8_t),
	void (*post_process)(struct properties *), const char *prefix,
	const char *argv0)
{
	struct properties_compressed comp;
	struct properties_major_minor mm;
	struct properties_payload payload;
	struct properties *prop;
	size_t i, j, prefixlen = strlen(prefix);
	char buf1[64], prefix_uc[64], buf2[64], buf3[64], buf4[64];

	/*
	 * allocate property buffer for all 0x110000 codepoints and
	 * initialize its entries to the known invalid value "speclen"
	 */
	if (!(prop = calloc(UINT32_C(0x110000), sizeof(*prop)))) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}
	for (i = 0; i < UINT32_C(0x110000); i++) {
		prop[i].property = speclen;
	}

	/* generate data */
	payload.prop = prop;
	payload.spec = spec;
	payload.speclen = speclen;
	payload.set_value = set_value_bp;
	payload.handle_conflict = handle_conflict;

	/* parse each file exactly once and ignore NULL-fields */
	for (i = 0; i < speclen; i++) {
		for (j = 0; j < i; j++) {
			if (spec[i].file && spec[j].file &&
			    !strcmp(spec[i].file, spec[j].file)) {
				/* file has already been parsed */
				break;
			}
		}
		if (i == j && spec[i].file) {
			/* file has not been processed yet */
			parse_file_with_callback(spec[i].file,
			                         properties_callback, &payload);
		}
	}

	/* fill in the missing properties that weren't explicitly given */
	for (i = 0; i < UINT32_C(0x110000); i++) {
		if (payload.prop[i].property == speclen) {
			if (fill_missing != NULL) {
				payload.prop[i].property =
					fill_missing((uint_least32_t)i);
			} else {
				payload.prop[i].property = 0;
			}
		}
	}

	/* post-processing */
	if (post_process != NULL) {
		post_process(payload.prop);
	}

	/* compress data */
	printf("/* Automatically generated by %s */\n#include <stdint.h>\n\n",
	       argv0);
	properties_compress(prop, &comp);

	fprintf(stderr, "%s: %s-LUT compression-ratio: %.2f%%\n", argv0, prefix,
	        properties_get_major_minor(&comp, &mm));

	/* prepare names */
	if ((size_t)snprintf(buf1, LEN(buf1), "%s_property", prefix) >=
	    LEN(buf1)) {
		fprintf(stderr, "snprintf: String truncated.\n");
		exit(1);
	}

	/* the uppercase copy needs prefixlen bytes plus the terminator */
	if (prefixlen >= LEN(prefix_uc)) {
		fprintf(stderr, "snprintf: Buffer too small.\n");
		exit(1);
	}
	for (i = 0; i < prefixlen; i++) {
		/* toupper() is only defined for unsigned char values and EOF */
		prefix_uc[i] = (char)toupper((unsigned char)prefix[i]);
	}
	prefix_uc[prefixlen] = '\0';

	if ((size_t)snprintf(buf2, LEN(buf2), "%s_PROP", prefix_uc) >=
	            LEN(buf2) ||
	    (size_t)snprintf(buf3, LEN(buf3), "%s_major", prefix) >=
	            LEN(buf3) ||
	    (size_t)snprintf(buf4, LEN(buf4), "%s_minor", prefix) >=
	            LEN(buf4)) {
		fprintf(stderr, "snprintf: String truncated.\n");
		exit(1);
	}

	/* print data */
	properties_print_enum(spec, speclen, buf1, buf2);
	properties_print_lookup_table(buf3, mm.major, 0x1100);
	printf("\n");
	properties_print_derived_lookup_table(buf4, mm.minor, mm.minorlen,
	                                      get_value_bp, comp.data);

	/* free data */
	free(prop);
	free(comp.data);
	free(comp.offset);
	free(mm.major);
	free(mm.minor);
}
/*
 * Line callback for parse_file_with_callback() that appends one break
 * test to the array referenced by the payload.
 *
 * The first field has the form "<÷|×> <cp> <÷|×> ... <cp> <÷|×>", where
 * '÷' marks a break and '×' a non-break. All codepoints are collected in
 * the test's cp array and the len array receives the number of codepoints
 * of each segment between breaks. The comment, if any, is duplicated into
 * descr. Note that the field is tokenized in place with strtok().
 *
 * Returns 0 on success and 1 on a missing field, a malformed delimiter or
 * codepoint, or allocation failure.
 */
static int
break_test_callback(const char *fname, char **field, size_t nfields,
                    char *comment, void *payload)
{
	struct break_test_payload *pl = (struct break_test_payload *)payload;
	struct break_test **test = pl->test, *cur;
	size_t *testlen = pl->testlen, pos;
	char *tok;

	(void)fname;

	if (nfields < 1) {
		return 1;
	}

	/* append new testcase and initialize with zeroes */
	*test = realloc(*test, ++(*testlen) * sizeof(**test));
	if (*test == NULL) {
		fprintf(stderr, "break_test_callback: realloc: %s.\n",
		        strerror(errno));
		return 1;
	}
	cur = &(*test)[*testlen - 1];
	memset(cur, 0, sizeof(*cur));

	/* tokens alternate: even positions are delimiters, odd codepoints */
	pos = 0;
	tok = strtok(field[0], " ");
	while (tok != NULL) {
		if (pos % 2 != 0) {
			/* add codepoint to cp-array */
			cur->cp = realloc(cur->cp,
			                  ++cur->cplen * sizeof(*cur->cp));
			if (cur->cp == NULL) {
				fprintf(stderr,
				        "break_test_callback: "
				        "realloc: %s.\n",
				        strerror(errno));
				return 1;
			}
			if (hextocp(tok, strlen(tok),
			            &cur->cp[cur->cplen - 1])) {
				return 1;
			}
			/* the codepoint belongs to the current segment */
			if (cur->lenlen > 0) {
				cur->len[cur->lenlen - 1]++;
			}
		} else if (pos == 0 ||
		           !strncmp(tok, "\xC3\xB7", 2)) { /* UTF-8 */
			/*
			 * start of sequence or '÷' (a breakpoint): the
			 * current segment is complete, so open a new
			 * length field and set it to 0
			 */
			cur->len = realloc(cur->len,
			                   ++cur->lenlen * sizeof(*cur->len));
			if (cur->len == NULL) {
				fprintf(stderr,
				        "break_test_"
				        "callback: realloc: %s.\n",
				        strerror(errno));
				return 1;
			}
			cur->len[cur->lenlen - 1] = 0;
		} else if (strncmp(tok, "\xC3\x97", 2)) { /* UTF-8 */
			/* neither '÷' nor '×' (a non-breakpoint) */
			fprintf(stderr,
			        "break_test_callback: "
			        "Malformed delimiter '%s'.\n",
			        tok);
			return 1;
		}

		pos++;
		tok = strtok(NULL, " ");
	}

	/*
	 * the breakpoint at the end opened a segment that received no
	 * codepoints; drop it again
	 */
	if (cur->lenlen > 0 && cur->len[cur->lenlen - 1] == 0) {
		cur->lenlen--;
	}

	/* store comment */
	if (comment != NULL) {
		cur->descr = strdup(comment);
		if (cur->descr == NULL) {
			fprintf(stderr, "break_test_callback: strdup: %s.\n",
			        strerror(errno));
			return 1;
		}
	}

	return 0;
}
/*
 * Parse the break test file fname. *test and *testlen are reset to an
 * empty list and then filled by break_test_callback(), one test per line.
 * parse_file_with_callback() terminates the program on any error.
 */
void
break_test_list_parse(char *fname, struct break_test **test, size_t *testlen)
{
	struct break_test_payload pl;

	/* start out with an empty list */
	*testlen = 0;
	*test = NULL;

	pl.test = test;
	pl.testlen = testlen;

	parse_file_with_callback(fname, break_test_callback, &pl);
}
/*
 * Print descr to stdout as a double-quoted C string literal, escaping
 * double quotes and backslashes. A NULL descr is printed as "".
 */
static void
break_test_print_descr(const char *descr)
{
	putchar('"');
	for (; descr != NULL && *descr != '\0'; descr++) {
		if (*descr == '"' || *descr == '\\') {
			putchar('\\');
		}
		putchar(*descr);
	}
	putchar('"');
}

/*
 * Print the testlen tests in test to stdout as the C definition of a
 * static const array of struct break_test called identifier, preceded by
 * a header naming progname as the generator and the required includes.
 *
 * Each element is written with designated initializers; the cp and len
 * arrays are emitted as compound literals. A test without description
 * (descr == NULL) gets an empty string.
 */
void
break_test_list_print(const struct break_test *test, size_t testlen,
                      const char *identifier, const char *progname)
{
	size_t i, j;

	printf("/* Automatically generated by %s */\n"
	       "#include <stdint.h>\n#include <stddef.h>\n\n"
	       "#include \"../gen/types.h\"\n\n",
	       progname);

	printf("static const struct break_test %s[] = {\n", identifier);
	for (i = 0; i < testlen; i++) {
		printf("\t{\n");

		printf("\t\t.cp = (uint_least32_t[]){");
		for (j = 0; j < test[i].cplen; j++) {
			/* codepoints are uint_least32_t, not unsigned int */
			printf(" UINT32_C(0x%06" PRIXLEAST32 ")",
			       test[i].cp[j]);
			if (j + 1 < test[i].cplen) {
				putchar(',');
			}
		}
		printf(" },\n");
		printf("\t\t.cplen = %zu,\n", test[i].cplen);

		printf("\t\t.len = (size_t[]){");
		for (j = 0; j < test[i].lenlen; j++) {
			printf(" %zu", test[i].len[j]);
			if (j + 1 < test[i].lenlen) {
				putchar(',');
			}
		}
		printf(" },\n");
		printf("\t\t.lenlen = %zu,\n", test[i].lenlen);

		/* the description must remain a valid string literal */
		printf("\t\t.descr = ");
		break_test_print_descr(test[i].descr);
		printf(",\n");

		printf("\t},\n");
	}
	printf("};\n");
}
/*
 * Release a test list created by break_test_list_parse(): the cp, len
 * and descr members of each of the testlen tests, then the array itself.
 */
void
break_test_list_free(struct break_test *test, size_t testlen)
{
	size_t k = 0;

	while (k < testlen) {
		free(test[k].cp);
		free(test[k].len);
		free(test[k].descr);
		k++;
	}

	free(test);
}