blob: 129648ec5b093f6a042891672a356b23886d74db [file] [log] [blame] [edit]
/* See LICENSE file for copyright and license details. */
#include <ctype.h>
#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "util.h"
struct range {
uint_least32_t lower;
uint_least32_t upper;
};
static int
hextocp(const char *str, size_t len, uint_least32_t *cp)
{
size_t i;
int off;
char relative;
/* the maximum valid codepoint is 0x10FFFF */
if (len > 6) {
fprintf(stderr, "hextocp: '%.*s' is too long.\n", (int)len,
str);
return 1;
}
for (i = 0, *cp = 0; i < len; i++) {
if (str[i] >= '0' && str[i] <= '9') {
relative = '0';
off = 0;
} else if (str[i] >= 'a' && str[i] <= 'f') {
relative = 'a';
off = 10;
} else if (str[i] >= 'A' && str[i] <= 'F') {
relative = 'A';
off = 10;
} else {
fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n",
(int)len, str);
return 1;
}
*cp += ((uint_least32_t)1 << (4 * (len - i - 1))) *
(uint_least32_t)(str[i] - relative + off);
}
if (*cp > UINT32_C(0x10FFFF)) {
fprintf(stderr, "hextocp: '%.*s' is too large.\n", (int)len,
str);
return 1;
}
return 0;
}
int
parse_range(const char *str, struct range *range)
{
char *p;
if ((p = strstr(str, "..")) == NULL) {
/* input has the form "XXXXXX" */
if (hextocp(str, strlen(str), &range->lower)) {
return 1;
}
range->upper = range->lower;
} else {
/* input has the form "XXXXXX..XXXXXX" */
if (hextocp(str, (size_t)(p - str), &range->lower) ||
hextocp(p + 2, strlen(p + 2), &range->upper)) {
return 1;
}
}
return 0;
}
static bool
get_line(char **buf, size_t *bufsize, FILE *fp, size_t *len)
{
int ret = EOF;
for (*len = 0;; (*len)++) {
if (*len > 0 && *buf != NULL && (*buf)[*len - 1] == '\n') {
/*
* if the previously read character was a newline,
* we fake an end-of-file so we NUL-terminate and
* are done.
*/
ret = EOF;
} else {
ret = fgetc(fp);
}
if (*len >= *bufsize) {
/* the buffer needs to be expanded */
*bufsize += 512;
if ((*buf = realloc(*buf, *bufsize)) == NULL) {
fprintf(stderr, "get_line: Out of memory.\n");
exit(1);
}
}
if (ret != EOF) {
(*buf)[*len] = (char)ret;
} else {
(*buf)[*len] = '\0';
break;
}
}
return *len == 0 && (feof(fp) || ferror(fp));
}
static char *
duplicate_string(const char *src)
{
size_t len = strlen(src);
char *dest;
if (!(dest = malloc(len + 1))) {
fprintf(stderr, "malloc: %s\n", strerror(errno));
exit(1);
}
memcpy(dest, src, len);
dest[len] = '\0';
return dest;
}
void
duplicate_codepoint_property(const struct codepoint_property *src,
struct codepoint_property *dest)
{
size_t i;
/* copy the field count */
dest->field_count = src->field_count;
/* allocate the field array */
if (!(dest->fields = calloc(dest->field_count, sizeof(*(dest->fields))))) {
fprintf(stderr, "malloc: %s\n", strerror(errno));
exit(1);
}
for (i = 0; i < dest->field_count; i++) {
dest->fields[i] = duplicate_string(src->fields[i]);
}
}
static void
free_codepoint_property(struct codepoint_property *p)
{
size_t i;
for (i = 0; i < p->field_count; i++) {
free(p->fields[i]);
}
free(p->fields);
}
static void
free_codepoint_property_set(struct codepoint_property_set *p)
{
size_t i;
for (i = 0; i < p->property_count; i++) {
free_codepoint_property(&(p->properties[i]));
}
free(p->properties);
}
void
free_codepoint_property_set_array(struct codepoint_property_set *p)
{
uint_least32_t cp;
for (cp = 0; cp < UINT32_C(0x110000); cp++) {
free_codepoint_property_set(&(p[cp]));
}
free(p);
}
const struct codepoint_property *
match_in_codepoint_property_set(const struct codepoint_property_set *p,
const char *str, size_t field_offset)
{
size_t i;
for (i = 0; i < p->property_count; i++) {
/* there must be enough fields to reach the offset */
if (field_offset >= p->properties[i].field_count) {
continue;
}
/* check for a match */
if (strcmp(p->properties[i].fields[field_offset], str) == 0) {
return &(p->properties[i]);
}
}
return NULL;
}
struct codepoint_property_set *
parse_property_file(const char *fname)
{
FILE *fp;
struct codepoint_property p;
struct codepoint_property_set *cpp, *missing;
struct range range;
char *line = NULL, **field = NULL, *comment;
uint_least32_t cp;
size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields, len;
bool is_missing;
/*
* allocate cpp buffer of length 0x11000, initialised to zeros
* (NULL 'properties' pointer, zero property_count)
*/
if (!(cpp = calloc((size_t)UINT32_C(0x110000), sizeof(*cpp)))) {
fprintf(stderr, "calloc: %s\n", strerror(errno));
exit(1);
}
/*
* allocate same-sized array for 'missing' fields. The data files
* have this strange notion of defining some properties in bloody
* comments, and in a way that says 'yeah, if we did not define
* something for some, then replace it with this value'. However,
* it complicates things, as multiple, overlapping @missing
* can be defined in a single file. One can deduce that subsequent
* @missing just overwrite each other, but there's no way to
* track which properties have not been set without running
* through the file multiple times, which we want to avoid, so
* we accumulate the (self-overwriting) @missing set separately
* and fill it in at the end.
*/
if (!(missing = calloc((size_t)UINT32_C(0x110000), sizeof(*missing)))) {
fprintf(stderr, "calloc: %s\n", strerror(errno));
exit(1);
}
/* open the property file */
if (!(fp = fopen(fname, "r"))) {
fprintf(stderr, "parse_property_file: fopen '%s': %s.\n",
fname, strerror(errno));
exit(1);
}
while (!get_line(&line, &linebufsize, fp, &len)) {
/* remove trailing newline */
if (len > 0 && line[len - 1] == '\n') {
line[len - 1] = '\0';
len--;
}
/* skip empty lines */
if (len == 0) {
continue;
}
/* comment, check if it's a @missing */
is_missing = false;
if (line[0] == '#') {
if (strncmp(line + 1, " @missing: ", LEN(" @missing: ") - 1) == 0) {
/*
* this comment specifies a 'missing' line.
* We shift it to the left and treat it
* like any other line, differentiating it
* with the 'is_missing' flag.
*/
size_t offset = 1 + (LEN(" @missing: ") - 1);
memmove(line, line + offset, len - offset + 1);
len -= offset;
is_missing = true;
} else {
/* skip unrelated comment line */
continue;
}
}
/* tokenize line into fields */
for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
/* skip leading whitespace */
while (line[i] == ' ') {
i++;
}
/* check if we crashed into the comment */
if (line[i] != '#') {
/* extend field buffer, if necessary */
if (++nfields > fieldbufsize) {
if ((field = realloc(
field,
nfields *
sizeof(*field))) ==
NULL) {
fprintf(stderr,
"parse_property_"
"file: realloc: "
"%s.\n",
strerror(errno));
exit(1);
}
fieldbufsize = nfields;
}
/* set current position as field start */
field[nfields - 1] = &line[i];
/* continue until we reach ';' or '#' or end */
while (line[i] != ';' && line[i] != '#' &&
line[i] != '\0') {
i++;
}
}
if (line[i] == '#') {
/* set comment-variable for later */
comment = &line[i + 1];
}
/* go back whitespace and terminate field there */
if (i > 0) {
for (j = i - 1; line[j] == ' '; j--) {
;
}
line[j + 1] = '\0';
} else {
line[i] = '\0';
}
/* if comment is set, we are done */
if (comment != NULL) {
break;
}
}
/* skip leading whitespace in comment */
while (comment != NULL && comment[0] == ' ') {
comment++;
}
/*
* We now have an array 'field' with 'nfields'. We can
* follow from the file format that, if nfields > 0,
* field[0] specifies a codepoint or range of
* codepoints, which we parse as the first step.
*/
if (nfields < 2) {
/*
* either there is only a range or no range at
* all. Report this
*/
fprintf(stderr, "parse_property_file: malformed "
"line with either no range or no entry\n");
exit(1);
}
if (parse_range(field[0], &range)) {
fprintf(stderr, "parse_property_file: parse_range of "
"'%s' failed.\n", field[0]);
exit(1);
}
/*
* fill a codepoint_property from the remaining
* fields (no allocations on a stack-allocated struct
*/
p.fields = field + 1;
p.field_count = nfields - 1;
/*
* add a duplicate of the filled codepoint_property
* to all codepoints in the range (i.e. the cpp or
* missing array, depending on is_missing)
*/
for (cp = range.lower; cp <= range.upper; cp++) {
if (is_missing) {
/*
* we overwrite the set at the codepoint,
* as the @missing properties overwrite
* each other (bad design)
*/
if (missing[cp].property_count == 0) {
/*
* set is still empty, add space
* for one pointer to a
* codepoint_property
*/
if (!(missing[cp].properties =
malloc(sizeof(*(missing[cp].properties))))) {
fprintf(stderr, "malloc: %s\n",
strerror(errno));
exit(1);
}
missing[cp].property_count = 1;
} else if (missing[cp].property_count == 1) {
/* free the old property */
free_codepoint_property(
&(missing[cp].properties[0]));
} else {
/* this shouldn't happen */
fprintf(stderr, "parse_property_file: "
"malformed missing set\n");
exit(1);
}
/* copy in the new one */
duplicate_codepoint_property(
&p, &(missing[cp].properties[0]));
} else {
/* expand the set by one element */
if (!(cpp[cp].properties = realloc(
cpp[cp].properties,
sizeof(*(cpp[cp].properties)) *
(++cpp[cp].property_count)))) {
fprintf(stderr, "malloc: %s\n",
strerror(errno));
exit(1);
}
duplicate_codepoint_property(&p,
&(cpp[cp].properties[cpp[cp].property_count - 1]));
}
}
}
/*
* now we add the missing elements. We purposefully do not
* follow the interpretation in DerivedCoreProperties.txt for
* the @missing 'InCB; None'. Missing, for us, means that
* _no_ properties have been extracted and thus property_count
* is zero, not that some first field is not set. Absolute
* madness to even publish data like this...
*/
for (cp = 0; cp < UINT32_C(0x110000); cp++) {
if (cpp[cp].property_count == 0) {
/* swap the whole lot */
cpp[cp].properties = missing[cp].properties;
cpp[cp].property_count = missing[cp].property_count;
missing[cp].properties = NULL;
missing[cp].property_count = 0;
}
}
/* close file */
if (fclose(fp)) {
fprintf(stderr, "parse_property_file: fclose '%s': %s.\n",
fname, strerror(errno));
exit(1);
}
/* cleanup */
free_codepoint_property_set_array(missing);
free(line);
free(field);
/* return codepoint properties array */
return cpp;
}
struct compression_output {
uint_least64_t *major;
uint_least64_t *minor;
size_t block_size_shift;
size_t block_size;
size_t major_size;
size_t minor_size;
size_t major_entry_size;
size_t minor_entry_size;
size_t total_size;
};
static void
compress_array(const uint_least64_t *array, size_t block_size_shift,
struct compression_output *co)
{
size_t i, j, major_maximum, minor_maximum;
/* set some preliminary data in the output struct */
co->block_size_shift = block_size_shift;
co->block_size = ((size_t)1) << block_size_shift;
co->major_size = UINT32_C(0x110000) / co->block_size;
/* allocate/initialise */
if (!(co->major = malloc(co->major_size * sizeof(*(co->major))))) {
fprintf(stderr, "malloc: %s\n", strerror(errno));
exit(1);
}
co->minor = NULL;
co->minor_size = 0;
/* iterate over all blocks in the array */
for (i = 0; i < co->major_size; i++) {
size_t block_offset = i * co->block_size;
/*
* iterate over all possible matching block starting
* positions in the minor array
*/
for (j = 0; j + co->block_size < co->minor_size; j++) {
/*
* check if our current block matches the
* candidate block in the minor array
*/
if (memcmp(array + block_offset,
co->minor + j,
co->block_size * sizeof(*array)) == 0) {
/*
* we have a match, so we don't have to
* store this block explicitly and just
* point to the offset in minor
*/
co->major[i] = j;
break;
}
}
if (j + co->block_size >= co->minor_size) {
/*
* we found no matching subblock in minor. Add it
* to the minor array. The index to the first
* element we add now is exactly the size
* of the minor array.
*/
co->major[i] = co->minor_size;
co->minor_size += co->block_size;
if (!(co->minor = realloc(co->minor, co->minor_size *
sizeof(*(co->minor))))) {
fprintf(stderr, "malloc: %s\n",
strerror(errno));
exit(1);
}
memcpy(co->minor + co->minor_size - co->block_size,
array + block_offset,
co->block_size * sizeof(*array));
}
}
/* the compression is done. Now we derive some metadata */
/* determine maxima */
for (i = 0, major_maximum = 0; i < co->major_size; i++) {
if (co->major[i] > major_maximum) {
major_maximum = co->major[i];
}
}
for (i = 0, minor_maximum = 0; i < co->minor_size; i++) {
if (co->minor[i] > minor_maximum) {
minor_maximum = co->minor[i];
}
}
/* determine entry sizes */
if (major_maximum < UINT64_C(1) << 8) {
co->major_entry_size = 8;
} else if (major_maximum < UINT64_C(1) << 16) {
co->major_entry_size = 16;
} else if (major_maximum < UINT64_C(1) << 32) {
co->major_entry_size = 32;
} else {
co->major_entry_size = 64;
}
if (minor_maximum < UINT64_C(1) << 4) {
/* using 4 bit packed nibbles */
co->minor_entry_size = 4;
} else if (minor_maximum < UINT64_C(1) << 8) {
co->minor_entry_size = 8;
} else if (minor_maximum < UINT64_C(1) << 16) {
co->minor_entry_size = 16;
} else if (minor_maximum < UINT64_C(1) << 32) {
co->minor_entry_size = 32;
} else {
co->minor_entry_size = 64;
}
/* total data size in bytes */
co->total_size = ((co->major_size * co->major_entry_size) +
(co->minor_size * co->minor_entry_size)) / 8;
}
void
free_compressed_data(struct compression_output *co)
{
free(co->major);
free(co->minor);
memset(co, 0, sizeof(*co));
}
void
print_array(const uint_least64_t *array, size_t array_size,
size_t array_entry_size, const char *prefix,
const char *name)
{
size_t i;
if (array_entry_size == 4) {
/* we store two 4-bit nibbles in one byte */
if (array_size % 2 != 0) {
/* ensure array lenght is even */
fprintf(stderr, "print_array: 4-bit array "
"is not implemented for odd length.");
exit(1);
}
printf("static const uint_least8_t %s_%s[] = {",
prefix, name);
for (i = 0; i < array_size; i += 2) {
if ((i / 2) % 8 == 0) {
printf("\n\t");
} else {
printf(" ");
}
/* write high and low nibbles */
printf("%zu", (array[i] << 4) | array[i + 1]);
if (i + 2 < array_size) {
printf(",");
}
}
printf("\n};\n");
} else {
printf("static const uint_least%zu_t %s_%s[] = {",
array_entry_size, prefix, name);
for (i = 0; i < array_size; i++) {
if (i % 8 == 0) {
printf("\n\t");
} else {
printf(" ");
}
printf("%zu", array[i]);
if (i + 1 < array_size) {
printf(",");
}
}
printf("\n};\n");
}
}
void
compress_and_output(uint_least64_t *array, const char *prefix)
{
struct compression_output co, co_best;
size_t i, j, dictionary_size, dictionary_entry_size;
uint_least64_t maximum = 0, *dictionary, *keys;
/* initialise dictionary */
dictionary = NULL;
dictionary_size = 0;
/* initialise keys */
if (!(keys = calloc(UINT32_C(0x110000), sizeof(*keys)))) {
fprintf(stderr, "calloc: %s\n", strerror(errno));
exit(1);
}
for (i = 0; i < UINT32_C(0x110000); i++) {
/* maximum determination */
if (array[i] > maximum) {
maximum = array[i];
}
/* check if the array value is already in the dictionary */
for (j = 0; j < dictionary_size; j++) {
if (array[i] == dictionary[j]) {
break;
}
}
if (j == dictionary_size) {
/* not in the dictionary, insert the array value */
if (!(dictionary = realloc(dictionary, (++dictionary_size) *
sizeof(*dictionary)))) {
fprintf(stderr, "realloc: %s\n", strerror(errno));
exit(1);
}
dictionary[dictionary_size - 1] = array[i];
}
/* set the index (key) of the matched dictionary entry */
keys[i] = j;
}
/* check if dictionary size is below a reasonable threshold */
if (dictionary_size > 256) {
fprintf(stderr, "compress_and_output: dictionary too large\n");
exit(1);
}
/*
* compress keys array with different block size shifts
* (block sizes from 16=1<<4 to 32768=1<<15)
*/
memset(&co_best, 0, sizeof(co_best));
co_best.total_size = SIZE_MAX;
for (i = 15; i >= 4; i--) {
compress_array(keys, i, &co);
fprintf(stderr, "compress_and_output: "
"block size %zu -> data size %.1f kB (%zu,%zu)\n",
((size_t)1) << i, (double)co.total_size / 1000, co.major_entry_size, co.minor_entry_size);
if (co.total_size < co_best.total_size) {
/* we have a new best compression */
free_compressed_data(&co_best);
co_best = co;
} else {
/* not optimal, discard it */
free_compressed_data(&co);
}
}
fprintf(stderr, "compress_and_output: choosing optimal block size %zu (%zu,%zu)\n",
co_best.block_size, co_best.major_entry_size, co_best.minor_entry_size);
/* output header */
printf("/* Automatically generated by gen2/%s */\n"
"#include <stdint.h>\n\n"
"#include \"character.h\"\n\n", prefix);
/* output dictionary */
if (maximum < UINT64_C(1) << 8) {
dictionary_entry_size = 8;
} else if (maximum < UINT64_C(1) << 16) {
dictionary_entry_size = 16;
} else if (maximum < UINT64_C(1) << 32) {
dictionary_entry_size = 32;
} else {
dictionary_entry_size = 64;
}
print_array(dictionary, dictionary_size, dictionary_entry_size,
prefix, "dictionary");
printf("\n");
/* output major array */
print_array(co_best.major, co_best.major_size,
co_best.major_entry_size, prefix, "major");
printf("\n");
/* output minor array */
print_array(co_best.minor, co_best.minor_size,
co_best.minor_entry_size, prefix, "minor");
printf("\n");
/* output accessor function (major is never 4-bits per entry) */
printf("static inline uint_least%zu_t\n"
"get_%s_property(uint_least32_t cp)\n"
"{\n"
"\tuint_least%zu_t minor_offset = %s_major[cp >> %zu]\n"
"\t\t+ (cp & ((UINT32_C(1) << %zu) - 1));\n",
dictionary_entry_size, prefix, co_best.major_entry_size,
prefix, co_best.block_size_shift, co_best.block_size_shift);
if (co_best.minor_entry_size == 4) {
printf("\tuint_least8_t packed_value = %s_minor[minor_offset / 2];\n\n"
"\tif (minor_offset & UINT8_C(1) == 0) {\n"
"\t\t/* high nibble */\n"
"\t\treturn %s_dictionary[packed_value >> 4];\n"
"\t} else {\n"
"\t\t/* low nibble */\n"
"\t\treturn %s_dictionary[packed_value & UINT8_C(0x0f)];\n"
"\t}\n",
prefix, prefix, prefix);
} else {
printf("\n\treturn %s_dictionary[%s_minor[minor_offset]];\n",
prefix, prefix);
}
printf("}\n");
/* cleanup */
free(dictionary);
free(keys);
}