| /* See LICENSE file for copyright and license details. */ |
| #include <ctype.h> |
| #include <errno.h> |
| #include <inttypes.h> |
| #include <stdbool.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "util.h" |
| |
| struct range { |
| uint_least32_t lower; |
| uint_least32_t upper; |
| }; |
| |
| static int |
| hextocp(const char *str, size_t len, uint_least32_t *cp) |
| { |
| size_t i; |
| int off; |
| char relative; |
| |
| /* the maximum valid codepoint is 0x10FFFF */ |
| if (len > 6) { |
| fprintf(stderr, "hextocp: '%.*s' is too long.\n", (int)len, |
| str); |
| return 1; |
| } |
| |
| for (i = 0, *cp = 0; i < len; i++) { |
| if (str[i] >= '0' && str[i] <= '9') { |
| relative = '0'; |
| off = 0; |
| } else if (str[i] >= 'a' && str[i] <= 'f') { |
| relative = 'a'; |
| off = 10; |
| } else if (str[i] >= 'A' && str[i] <= 'F') { |
| relative = 'A'; |
| off = 10; |
| } else { |
| fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n", |
| (int)len, str); |
| return 1; |
| } |
| |
| *cp += ((uint_least32_t)1 << (4 * (len - i - 1))) * |
| (uint_least32_t)(str[i] - relative + off); |
| } |
| |
| if (*cp > UINT32_C(0x10FFFF)) { |
| fprintf(stderr, "hextocp: '%.*s' is too large.\n", (int)len, |
| str); |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| int |
| parse_range(const char *str, struct range *range) |
| { |
| char *p; |
| |
| if ((p = strstr(str, "..")) == NULL) { |
| /* input has the form "XXXXXX" */ |
| if (hextocp(str, strlen(str), &range->lower)) { |
| return 1; |
| } |
| range->upper = range->lower; |
| } else { |
| /* input has the form "XXXXXX..XXXXXX" */ |
| if (hextocp(str, (size_t)(p - str), &range->lower) || |
| hextocp(p + 2, strlen(p + 2), &range->upper)) { |
| return 1; |
| } |
| } |
| |
| return 0; |
| } |
| |
| static bool |
| get_line(char **buf, size_t *bufsize, FILE *fp, size_t *len) |
| { |
| int ret = EOF; |
| |
| for (*len = 0;; (*len)++) { |
| if (*len > 0 && *buf != NULL && (*buf)[*len - 1] == '\n') { |
| /* |
| * if the previously read character was a newline, |
| * we fake an end-of-file so we NUL-terminate and |
| * are done. |
| */ |
| ret = EOF; |
| } else { |
| ret = fgetc(fp); |
| } |
| |
| if (*len >= *bufsize) { |
| /* the buffer needs to be expanded */ |
| *bufsize += 512; |
| if ((*buf = realloc(*buf, *bufsize)) == NULL) { |
| fprintf(stderr, "get_line: Out of memory.\n"); |
| exit(1); |
| } |
| } |
| |
| if (ret != EOF) { |
| (*buf)[*len] = (char)ret; |
| } else { |
| (*buf)[*len] = '\0'; |
| break; |
| } |
| } |
| |
| return *len == 0 && (feof(fp) || ferror(fp)); |
| } |
| |
| static char * |
| duplicate_string(const char *src) |
| { |
| size_t len = strlen(src); |
| char *dest; |
| |
| if (!(dest = malloc(len + 1))) { |
| fprintf(stderr, "malloc: %s\n", strerror(errno)); |
| exit(1); |
| } |
| |
| memcpy(dest, src, len); |
| dest[len] = '\0'; |
| |
| return dest; |
| } |
| |
| void |
| duplicate_codepoint_property(const struct codepoint_property *src, |
| struct codepoint_property *dest) |
| { |
| size_t i; |
| |
| /* copy the field count */ |
| dest->field_count = src->field_count; |
| |
| /* allocate the field array */ |
| if (!(dest->fields = calloc(dest->field_count, sizeof(*(dest->fields))))) { |
| fprintf(stderr, "malloc: %s\n", strerror(errno)); |
| exit(1); |
| } |
| |
| for (i = 0; i < dest->field_count; i++) { |
| dest->fields[i] = duplicate_string(src->fields[i]); |
| } |
| } |
| |
| static void |
| free_codepoint_property(struct codepoint_property *p) |
| { |
| size_t i; |
| |
| for (i = 0; i < p->field_count; i++) { |
| free(p->fields[i]); |
| } |
| free(p->fields); |
| } |
| |
| static void |
| free_codepoint_property_set(struct codepoint_property_set *p) |
| { |
| size_t i; |
| |
| for (i = 0; i < p->property_count; i++) { |
| free_codepoint_property(&(p->properties[i])); |
| } |
| free(p->properties); |
| } |
| |
| void |
| free_codepoint_property_set_array(struct codepoint_property_set *p) |
| { |
| uint_least32_t cp; |
| |
| for (cp = 0; cp < UINT32_C(0x110000); cp++) { |
| free_codepoint_property_set(&(p[cp])); |
| } |
| free(p); |
| } |
| |
| const struct codepoint_property * |
| match_in_codepoint_property_set(const struct codepoint_property_set *p, |
| const char *str, size_t field_offset) |
| { |
| size_t i; |
| |
| for (i = 0; i < p->property_count; i++) { |
| /* there must be enough fields to reach the offset */ |
| if (field_offset >= p->properties[i].field_count) { |
| continue; |
| } |
| |
| /* check for a match */ |
| if (strcmp(p->properties[i].fields[field_offset], str) == 0) { |
| return &(p->properties[i]); |
| } |
| } |
| |
| return NULL; |
| } |
| |
| struct codepoint_property_set * |
| parse_property_file(const char *fname) |
| { |
| FILE *fp; |
| struct codepoint_property p; |
| struct codepoint_property_set *cpp, *missing; |
| struct range range; |
| char *line = NULL, **field = NULL, *comment; |
| uint_least32_t cp; |
| size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields, len; |
| bool is_missing; |
| |
| /* |
| * allocate cpp buffer of length 0x11000, initialised to zeros |
| * (NULL 'properties' pointer, zero property_count) |
| */ |
| if (!(cpp = calloc((size_t)UINT32_C(0x110000), sizeof(*cpp)))) { |
| fprintf(stderr, "calloc: %s\n", strerror(errno)); |
| exit(1); |
| } |
| |
| /* |
| * allocate same-sized array for 'missing' fields. The data files |
| * have this strange notion of defining some properties in bloody |
| * comments, and in a way that says 'yeah, if we did not define |
| * something for some, then replace it with this value'. However, |
| * it complicates things, as multiple, overlapping @missing |
| * can be defined in a single file. One can deduce that subsequent |
| * @missing just overwrite each other, but there's no way to |
| * track which properties have not been set without running |
| * through the file multiple times, which we want to avoid, so |
| * we accumulate the (self-overwriting) @missing set separately |
| * and fill it in at the end. |
| */ |
| if (!(missing = calloc((size_t)UINT32_C(0x110000), sizeof(*missing)))) { |
| fprintf(stderr, "calloc: %s\n", strerror(errno)); |
| exit(1); |
| } |
| |
| /* open the property file */ |
| if (!(fp = fopen(fname, "r"))) { |
| fprintf(stderr, "parse_property_file: fopen '%s': %s.\n", |
| fname, strerror(errno)); |
| exit(1); |
| } |
| |
| while (!get_line(&line, &linebufsize, fp, &len)) { |
| /* remove trailing newline */ |
| if (len > 0 && line[len - 1] == '\n') { |
| line[len - 1] = '\0'; |
| len--; |
| } |
| |
| /* skip empty lines */ |
| if (len == 0) { |
| continue; |
| } |
| |
| /* comment, check if it's a @missing */ |
| is_missing = false; |
| if (line[0] == '#') { |
| if (strncmp(line + 1, " @missing: ", LEN(" @missing: ") - 1) == 0) { |
| /* |
| * this comment specifies a 'missing' line. |
| * We shift it to the left and treat it |
| * like any other line, differentiating it |
| * with the 'is_missing' flag. |
| */ |
| size_t offset = 1 + (LEN(" @missing: ") - 1); |
| |
| memmove(line, line + offset, len - offset + 1); |
| len -= offset; |
| is_missing = true; |
| } else { |
| /* skip unrelated comment line */ |
| continue; |
| } |
| } |
| |
| /* tokenize line into fields */ |
| for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) { |
| /* skip leading whitespace */ |
| while (line[i] == ' ') { |
| i++; |
| } |
| |
| /* check if we crashed into the comment */ |
| if (line[i] != '#') { |
| /* extend field buffer, if necessary */ |
| if (++nfields > fieldbufsize) { |
| if ((field = realloc( |
| field, |
| nfields * |
| sizeof(*field))) == |
| NULL) { |
| fprintf(stderr, |
| "parse_property_" |
| "file: realloc: " |
| "%s.\n", |
| strerror(errno)); |
| exit(1); |
| } |
| fieldbufsize = nfields; |
| } |
| |
| /* set current position as field start */ |
| field[nfields - 1] = &line[i]; |
| |
| /* continue until we reach ';' or '#' or end */ |
| while (line[i] != ';' && line[i] != '#' && |
| line[i] != '\0') { |
| i++; |
| } |
| } |
| |
| if (line[i] == '#') { |
| /* set comment-variable for later */ |
| comment = &line[i + 1]; |
| } |
| |
| /* go back whitespace and terminate field there */ |
| if (i > 0) { |
| for (j = i - 1; line[j] == ' '; j--) { |
| ; |
| } |
| line[j + 1] = '\0'; |
| } else { |
| line[i] = '\0'; |
| } |
| |
| /* if comment is set, we are done */ |
| if (comment != NULL) { |
| break; |
| } |
| } |
| |
| /* skip leading whitespace in comment */ |
| while (comment != NULL && comment[0] == ' ') { |
| comment++; |
| } |
| |
| /* |
| * We now have an array 'field' with 'nfields'. We can |
| * follow from the file format that, if nfields > 0, |
| * field[0] specifies a codepoint or range of |
| * codepoints, which we parse as the first step. |
| */ |
| if (nfields < 2) { |
| /* |
| * either there is only a range or no range at |
| * all. Report this |
| */ |
| fprintf(stderr, "parse_property_file: malformed " |
| "line with either no range or no entry\n"); |
| exit(1); |
| } |
| |
| if (parse_range(field[0], &range)) { |
| fprintf(stderr, "parse_property_file: parse_range of " |
| "'%s' failed.\n", field[0]); |
| exit(1); |
| } |
| |
| /* |
| * fill a codepoint_property from the remaining |
| * fields (no allocations on a stack-allocated struct |
| */ |
| p.fields = field + 1; |
| p.field_count = nfields - 1; |
| |
| /* |
| * add a duplicate of the filled codepoint_property |
| * to all codepoints in the range (i.e. the cpp or |
| * missing array, depending on is_missing) |
| */ |
| for (cp = range.lower; cp <= range.upper; cp++) { |
| if (is_missing) { |
| /* |
| * we overwrite the set at the codepoint, |
| * as the @missing properties overwrite |
| * each other (bad design) |
| */ |
| if (missing[cp].property_count == 0) { |
| /* |
| * set is still empty, add space |
| * for one pointer to a |
| * codepoint_property |
| */ |
| if (!(missing[cp].properties = |
| malloc(sizeof(*(missing[cp].properties))))) { |
| fprintf(stderr, "malloc: %s\n", |
| strerror(errno)); |
| exit(1); |
| } |
| missing[cp].property_count = 1; |
| } else if (missing[cp].property_count == 1) { |
| /* free the old property */ |
| free_codepoint_property( |
| &(missing[cp].properties[0])); |
| } else { |
| /* this shouldn't happen */ |
| fprintf(stderr, "parse_property_file: " |
| "malformed missing set\n"); |
| exit(1); |
| } |
| |
| /* copy in the new one */ |
| duplicate_codepoint_property( |
| &p, &(missing[cp].properties[0])); |
| } else { |
| /* expand the set by one element */ |
| if (!(cpp[cp].properties = realloc( |
| cpp[cp].properties, |
| sizeof(*(cpp[cp].properties)) * |
| (++cpp[cp].property_count)))) { |
| fprintf(stderr, "malloc: %s\n", |
| strerror(errno)); |
| exit(1); |
| } |
| |
| duplicate_codepoint_property(&p, |
| &(cpp[cp].properties[cpp[cp].property_count - 1])); |
| } |
| } |
| } |
| |
| /* |
| * now we add the missing elements. We purposefully do not |
| * follow the interpretation in DerivedCoreProperties.txt for |
| * the @missing 'InCB; None'. Missing, for us, means that |
| * _no_ properties have been extracted and thus property_count |
| * is zero, not that some first field is not set. Absolute |
| * madness to even publish data like this... |
| */ |
| for (cp = 0; cp < UINT32_C(0x110000); cp++) { |
| if (cpp[cp].property_count == 0) { |
| /* swap the whole lot */ |
| cpp[cp].properties = missing[cp].properties; |
| cpp[cp].property_count = missing[cp].property_count; |
| missing[cp].properties = NULL; |
| missing[cp].property_count = 0; |
| } |
| } |
| |
| /* close file */ |
| if (fclose(fp)) { |
| fprintf(stderr, "parse_property_file: fclose '%s': %s.\n", |
| fname, strerror(errno)); |
| exit(1); |
| } |
| |
| /* cleanup */ |
| free_codepoint_property_set_array(missing); |
| free(line); |
| free(field); |
| |
| /* return codepoint properties array */ |
| return cpp; |
| } |
| |
| struct compression_output { |
| uint_least64_t *major; |
| uint_least64_t *minor; |
| size_t block_size_shift; |
| size_t block_size; |
| size_t major_size; |
| size_t minor_size; |
| size_t major_entry_size; |
| size_t minor_entry_size; |
| size_t total_size; |
| }; |
| |
| static void |
| compress_array(const uint_least64_t *array, size_t block_size_shift, |
| struct compression_output *co) |
| { |
| size_t i, j, major_maximum, minor_maximum; |
| |
| /* set some preliminary data in the output struct */ |
| co->block_size_shift = block_size_shift; |
| co->block_size = ((size_t)1) << block_size_shift; |
| co->major_size = UINT32_C(0x110000) / co->block_size; |
| |
| /* allocate/initialise */ |
| if (!(co->major = malloc(co->major_size * sizeof(*(co->major))))) { |
| fprintf(stderr, "malloc: %s\n", strerror(errno)); |
| exit(1); |
| } |
| co->minor = NULL; |
| co->minor_size = 0; |
| |
| /* iterate over all blocks in the array */ |
| for (i = 0; i < co->major_size; i++) { |
| size_t block_offset = i * co->block_size; |
| |
| /* |
| * iterate over all possible matching block starting |
| * positions in the minor array |
| */ |
| for (j = 0; j + co->block_size < co->minor_size; j++) { |
| /* |
| * check if our current block matches the |
| * candidate block in the minor array |
| */ |
| if (memcmp(array + block_offset, |
| co->minor + j, |
| co->block_size * sizeof(*array)) == 0) { |
| /* |
| * we have a match, so we don't have to |
| * store this block explicitly and just |
| * point to the offset in minor |
| */ |
| co->major[i] = j; |
| break; |
| } |
| } |
| if (j + co->block_size >= co->minor_size) { |
| /* |
| * we found no matching subblock in minor. Add it |
| * to the minor array. The index to the first |
| * element we add now is exactly the size |
| * of the minor array. |
| */ |
| co->major[i] = co->minor_size; |
| co->minor_size += co->block_size; |
| if (!(co->minor = realloc(co->minor, co->minor_size * |
| sizeof(*(co->minor))))) { |
| fprintf(stderr, "malloc: %s\n", |
| strerror(errno)); |
| exit(1); |
| } |
| memcpy(co->minor + co->minor_size - co->block_size, |
| array + block_offset, |
| co->block_size * sizeof(*array)); |
| } |
| } |
| |
| /* the compression is done. Now we derive some metadata */ |
| |
| /* determine maxima */ |
| for (i = 0, major_maximum = 0; i < co->major_size; i++) { |
| if (co->major[i] > major_maximum) { |
| major_maximum = co->major[i]; |
| } |
| } |
| for (i = 0, minor_maximum = 0; i < co->minor_size; i++) { |
| if (co->minor[i] > minor_maximum) { |
| minor_maximum = co->minor[i]; |
| } |
| } |
| |
| /* determine entry sizes */ |
| if (major_maximum < UINT64_C(1) << 8) { |
| co->major_entry_size = 8; |
| } else if (major_maximum < UINT64_C(1) << 16) { |
| co->major_entry_size = 16; |
| } else if (major_maximum < UINT64_C(1) << 32) { |
| co->major_entry_size = 32; |
| } else { |
| co->major_entry_size = 64; |
| } |
| |
| if (minor_maximum < UINT64_C(1) << 4) { |
| /* using 4 bit packed nibbles */ |
| co->minor_entry_size = 4; |
| } else if (minor_maximum < UINT64_C(1) << 8) { |
| co->minor_entry_size = 8; |
| } else if (minor_maximum < UINT64_C(1) << 16) { |
| co->minor_entry_size = 16; |
| } else if (minor_maximum < UINT64_C(1) << 32) { |
| co->minor_entry_size = 32; |
| } else { |
| co->minor_entry_size = 64; |
| } |
| |
| /* total data size in bytes */ |
| co->total_size = ((co->major_size * co->major_entry_size) + |
| (co->minor_size * co->minor_entry_size)) / 8; |
| } |
| |
| void |
| free_compressed_data(struct compression_output *co) |
| { |
| free(co->major); |
| free(co->minor); |
| memset(co, 0, sizeof(*co)); |
| } |
| |
| void |
| print_array(const uint_least64_t *array, size_t array_size, |
| size_t array_entry_size, const char *prefix, |
| const char *name) |
| { |
| size_t i; |
| |
| if (array_entry_size == 4) { |
| /* we store two 4-bit nibbles in one byte */ |
| if (array_size % 2 != 0) { |
| /* ensure array lenght is even */ |
| fprintf(stderr, "print_array: 4-bit array " |
| "is not implemented for odd length."); |
| exit(1); |
| } |
| |
| printf("static const uint_least8_t %s_%s[] = {", |
| prefix, name); |
| for (i = 0; i < array_size; i += 2) { |
| if ((i / 2) % 8 == 0) { |
| printf("\n\t"); |
| } else { |
| printf(" "); |
| } |
| |
| /* write high and low nibbles */ |
| printf("%zu", (array[i] << 4) | array[i + 1]); |
| |
| if (i + 2 < array_size) { |
| printf(","); |
| } |
| } |
| printf("\n};\n"); |
| } else { |
| printf("static const uint_least%zu_t %s_%s[] = {", |
| array_entry_size, prefix, name); |
| for (i = 0; i < array_size; i++) { |
| if (i % 8 == 0) { |
| printf("\n\t"); |
| } else { |
| printf(" "); |
| } |
| printf("%zu", array[i]); |
| if (i + 1 < array_size) { |
| printf(","); |
| } |
| } |
| printf("\n};\n"); |
| } |
| } |
| |
| void |
| compress_and_output(uint_least64_t *array, const char *prefix) |
| { |
| struct compression_output co, co_best; |
| size_t i, j, dictionary_size, dictionary_entry_size; |
| uint_least64_t maximum = 0, *dictionary, *keys; |
| |
| /* initialise dictionary */ |
| dictionary = NULL; |
| dictionary_size = 0; |
| |
| /* initialise keys */ |
| if (!(keys = calloc(UINT32_C(0x110000), sizeof(*keys)))) { |
| fprintf(stderr, "calloc: %s\n", strerror(errno)); |
| exit(1); |
| } |
| |
| for (i = 0; i < UINT32_C(0x110000); i++) { |
| /* maximum determination */ |
| if (array[i] > maximum) { |
| maximum = array[i]; |
| } |
| |
| /* check if the array value is already in the dictionary */ |
| for (j = 0; j < dictionary_size; j++) { |
| if (array[i] == dictionary[j]) { |
| break; |
| } |
| } |
| if (j == dictionary_size) { |
| /* not in the dictionary, insert the array value */ |
| if (!(dictionary = realloc(dictionary, (++dictionary_size) * |
| sizeof(*dictionary)))) { |
| fprintf(stderr, "realloc: %s\n", strerror(errno)); |
| exit(1); |
| } |
| dictionary[dictionary_size - 1] = array[i]; |
| } |
| |
| /* set the index (key) of the matched dictionary entry */ |
| keys[i] = j; |
| } |
| |
| /* check if dictionary size is below a reasonable threshold */ |
| if (dictionary_size > 256) { |
| fprintf(stderr, "compress_and_output: dictionary too large\n"); |
| exit(1); |
| } |
| |
| /* |
| * compress keys array with different block size shifts |
| * (block sizes from 16=1<<4 to 32768=1<<15) |
| */ |
| memset(&co_best, 0, sizeof(co_best)); |
| co_best.total_size = SIZE_MAX; |
| for (i = 15; i >= 4; i--) { |
| compress_array(keys, i, &co); |
| |
| fprintf(stderr, "compress_and_output: " |
| "block size %zu -> data size %.1f kB (%zu,%zu)\n", |
| ((size_t)1) << i, (double)co.total_size / 1000, co.major_entry_size, co.minor_entry_size); |
| |
| if (co.total_size < co_best.total_size) { |
| /* we have a new best compression */ |
| free_compressed_data(&co_best); |
| co_best = co; |
| } else { |
| /* not optimal, discard it */ |
| free_compressed_data(&co); |
| } |
| } |
| fprintf(stderr, "compress_and_output: choosing optimal block size %zu (%zu,%zu)\n", |
| co_best.block_size, co_best.major_entry_size, co_best.minor_entry_size); |
| |
| /* output header */ |
| printf("/* Automatically generated by gen2/%s */\n" |
| "#include <stdint.h>\n\n" |
| "#include \"character.h\"\n\n", prefix); |
| |
| /* output dictionary */ |
| if (maximum < UINT64_C(1) << 8) { |
| dictionary_entry_size = 8; |
| } else if (maximum < UINT64_C(1) << 16) { |
| dictionary_entry_size = 16; |
| } else if (maximum < UINT64_C(1) << 32) { |
| dictionary_entry_size = 32; |
| } else { |
| dictionary_entry_size = 64; |
| } |
| |
| print_array(dictionary, dictionary_size, dictionary_entry_size, |
| prefix, "dictionary"); |
| printf("\n"); |
| |
| /* output major array */ |
| print_array(co_best.major, co_best.major_size, |
| co_best.major_entry_size, prefix, "major"); |
| printf("\n"); |
| |
| /* output minor array */ |
| print_array(co_best.minor, co_best.minor_size, |
| co_best.minor_entry_size, prefix, "minor"); |
| printf("\n"); |
| |
| /* output accessor function (major is never 4-bits per entry) */ |
| printf("static inline uint_least%zu_t\n" |
| "get_%s_property(uint_least32_t cp)\n" |
| "{\n" |
| "\tuint_least%zu_t minor_offset = %s_major[cp >> %zu]\n" |
| "\t\t+ (cp & ((UINT32_C(1) << %zu) - 1));\n", |
| dictionary_entry_size, prefix, co_best.major_entry_size, |
| prefix, co_best.block_size_shift, co_best.block_size_shift); |
| if (co_best.minor_entry_size == 4) { |
| printf("\tuint_least8_t packed_value = %s_minor[minor_offset / 2];\n\n" |
| "\tif (minor_offset & UINT8_C(1) == 0) {\n" |
| "\t\t/* high nibble */\n" |
| "\t\treturn %s_dictionary[packed_value >> 4];\n" |
| "\t} else {\n" |
| "\t\t/* low nibble */\n" |
| "\t\treturn %s_dictionary[packed_value & UINT8_C(0x0f)];\n" |
| "\t}\n", |
| prefix, prefix, prefix); |
| } else { |
| printf("\n\treturn %s_dictionary[%s_minor[minor_offset]];\n", |
| prefix, prefix); |
| } |
| printf("}\n"); |
| |
| /* cleanup */ |
| free(dictionary); |
| free(keys); |
| } |