/* blob: c9f3501d6930f1c8dde07d60b89eabeb95f97dd4 [file] [log] [blame] */
/* See LICENSE file for copyright and license details. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "util.h"
/* UCD data file named by the file-backed entries of case_property */
#define FILE_DCP "data/DerivedCoreProperties.txt"

/*
 * Table of case property classes that main() hands to
 * properties_generate_break_property().
 *
 * handle_conflict() indexes this array with the property values it
 * receives and returns an index into it, so the order of the entries
 * is significant. OTHER and BOTH_CASED_CASE_IGNORABLE name neither a
 * file nor a UCD property; the latter is the class handle_conflict()
 * returns for a CASED/CASE_IGNORABLE clash.
 */
static const struct property_spec case_property[] = {
	{ .enumname = "OTHER", .file = NULL, .ucdname = NULL },
	{ .enumname = "BOTH_CASED_CASE_IGNORABLE", .file = NULL, .ucdname = NULL },
	{ .enumname = "CASED", .file = FILE_DCP, .ucdname = "Cased" },
	{ .enumname = "CASE_IGNORABLE", .file = FILE_DCP, .ucdname = "Case_Ignorable" },
	{ .enumname = "UNCASED", .file = FILE_DCP, .ucdname = "Uncased" },
};
static uint_least8_t
handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
{
uint_least8_t result;
(void)cp;
if ((!strcmp(case_property[prop1].enumname, "CASED") &&
!strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
(!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
!strcmp(case_property[prop2].enumname, "CASED"))) {
for (result = 0; result < LEN(case_property); result++) {
if (!strcmp(case_property[result].enumname,
"BOTH_CASED_CASE_IGNORABLE")) {
break;
}
}
if (result == LEN(case_property)) {
fprintf(stderr, "handle_conflict: Internal error.\n");
exit(1);
}
} else {
fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
exit(1);
}
return result;
}
/*
 * Per-codepoint mapping tables for the three case conversions. They
 * start out as NULL and are allocated in main(); the two parser
 * callbacks fill them in.
 */
static struct properties *prop_upper = NULL;
static struct properties *prop_lower = NULL;
static struct properties *prop_title = NULL;

/*
 * One special-casing rule: for each of the three case conversions a
 * list of codepoints (cp) together with its length (cplen).
 */
struct special_case {
	struct {
		uint_least32_t *cp;
		size_t cplen;
	} upper, lower, title;
};

/* growable array of collected special-casing rules and its length */
static struct special_case *sc = NULL;
static size_t sclen = 0;
/*
 * Line callback used by main() for data/UnicodeData.txt.
 *
 * field[0] holds the codepoint; fields 12, 13 and 14 hold its
 * uppercase, lowercase and titlecase mapping as hexadecimal
 * codepoints. A mapping field that is absent or empty means the
 * character maps to itself. For every line the signed distance from
 * the codepoint to each mapping is stored in prop_upper, prop_lower
 * and prop_title.
 *
 * file, comment and payload are unused.
 *
 * Returns 0 on success and 1 when the codepoint or one of the
 * mappings cannot be parsed (hextocp() returned non-zero) or the
 * codepoint does not fit the tables.
 */
static int
unicodedata_callback(const char *file, char **field, size_t nfields,
                     char *comment, void *payload)
{
	uint_least32_t cp, upper, lower, title;

	(void)file;
	(void)comment;
	(void)payload;

	/*
	 * the codepoint is used as a table index below, so reject the
	 * line when it is missing, unparsable or beyond the 0x110000
	 * entries allocated in main()
	 */
	if (nfields < 1 ||
	    hextocp(field[0], strlen(field[0]), &cp) ||
	    cp >= UINT32_C(0x110000)) {
		return 1;
	}

	/* without an explicit mapping a character maps to itself */
	upper = lower = title = cp;

	/*
	 * only touch a mapping field when the line actually has it;
	 * a field that is missing is treated like an empty one
	 */
	if ((nfields > 12 && strlen(field[12]) > 0 &&
	     hextocp(field[12], strlen(field[12]), &upper)) ||
	    (nfields > 13 && strlen(field[13]) > 0 &&
	     hextocp(field[13], strlen(field[13]), &lower)) ||
	    (nfields > 14 && strlen(field[14]) > 0 &&
	     hextocp(field[14], strlen(field[14]), &title))) {
		return 1;
	}

	/* store the mappings as offsets relative to the codepoint */
	prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
	prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
	prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;

	return 0;
}
/*
 * Line callback used by main() for data/SpecialCasing.txt.
 *
 * field[0] holds the affected codepoint, field[1], field[2] and
 * field[3] the lowercase, titlecase and uppercase codepoint lists.
 * A non-empty field[4] marks a conditional rule, which is skipped.
 * Every accepted rule is appended to the sc array, and the
 * codepoint's entry in all three property tables is replaced by
 * 0x110000 plus the rule's index in sc.
 *
 * file, comment and payload are unused.
 *
 * Returns 0 on success or when the rule is skipped, and 1 when the
 * line has too few fields or the codepoint cannot be parsed or does
 * not fit the tables. Terminates the program when sc cannot grow.
 */
static int
specialcasing_callback(const char *file, char **field, size_t nfields,
                       char *comment, void *payload)
{
	uint_least32_t cp;
	void *grown;

	(void)file;
	(void)comment;
	(void)payload;

	if (nfields > 4 && strlen(field[4]) > 0) {
		/*
		 * we have more than 4 fields, i.e. the rule has a
		 * condition (language-sensitive, etc.) and is discarded
		 */
		return 0;
	}

	/* fields 0 to 3 are all read below */
	if (nfields < 4) {
		return 1;
	}

	/*
	 * parse affected codepoint; it indexes the property tables,
	 * so it must be valid and within their 0x110000 entries
	 */
	if (hextocp(field[0], strlen(field[0]), &cp) ||
	    cp >= UINT32_C(0x110000)) {
		return 1;
	}

	/*
	 * extend special case array; sc and sclen are only updated
	 * once the larger allocation is known to exist
	 */
	grown = realloc(sc, (sclen + 1) * sizeof(*sc));
	if (!grown) {
		fprintf(stderr, "realloc: %s\n", strerror(errno));
		exit(1);
	}
	sc = grown;
	sclen++;

	/*
	 * parse field data
	 * NOTE(review): the outcome of parse_cp_list() is not inspected
	 * here - confirm whether it can fail and should be checked
	 */
	parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
	              &(sc[sclen - 1].upper.cplen));
	parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
	              &(sc[sclen - 1].lower.cplen));
	parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
	              &(sc[sclen - 1].title.cplen));

	/*
	 * overwrite value in "single mapping" property table by the
	 * special value 0x110000 + (offset in special case array),
	 * even if the special case has length 1
	 */
	prop_upper[cp].property =
		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
	prop_lower[cp].property =
		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
	prop_title[cp].property =
		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));

	return 0;
}
/*
 * Accessor handed to properties_print_derived_lookup_table() in
 * main(): yields the property value of the element at position
 * `offset` of the array `prop`.
 */
static int_least64_t
get_value(const struct properties *prop, size_t offset)
{
	const struct properties *entry = prop + offset;

	return entry->property;
}
/*
 * Print one element of a generated special_case table to stdout: the
 * cplen codepoints in cp as a compound-literal array, followed by the
 * length.
 */
static void
print_special_case_entry(const uint_least32_t *cp, size_t cplen)
{
	size_t j;

	printf("\t{\n");
	printf("\t\t.cp = (uint_least32_t[]){");
	for (j = 0; j < cplen; j++) {
		/*
		 * uint_least32_t need not be unsigned int, which is
		 * what a plain %X expects; convert explicitly to a
		 * type with a matching conversion specification
		 */
		printf(" UINT32_C(0x%06lX)", (unsigned long)cp[j]);
		if (j + 1 < cplen) {
			putchar(',');
		}
	}
	printf(" },\n");
	printf("\t\t.cplen = %zu,\n", cplen);
	printf("\t},\n");
}

/*
 * Generate the case tables.
 *
 * Emits the case property table via
 * properties_generate_break_property(), reads the simple mappings
 * from data/UnicodeData.txt and the unconditional special mappings
 * from data/SpecialCasing.txt, compresses the three per-codepoint
 * tables and prints them, together with the special-case arrays, as
 * C source on stdout. Compression ratios are reported on stderr.
 *
 * argc is unused; argv[0] is used as the program name in messages
 * and in the generated file's header comment.
 *
 * Returns 0; exits with status 1 when the table buffers cannot be
 * allocated.
 */
int
main(int argc, char *argv[])
{
	struct properties_compressed comp_upper, comp_lower, comp_title;
	struct properties_major_minor mm_upper, mm_lower, mm_title;
	size_t i;

	(void)argc;

	/* generate case property table from the specification */
	properties_generate_break_property(case_property, LEN(case_property),
	                                   NULL, handle_conflict, NULL, "case",
	                                   argv[0]);

	/*
	 * allocate property buffers for all 0x110000 codepoints
	 *
	 * the buffers contain the offset from the "base" character
	 * to the respective case mapping. By callocing we set all fields
	 * to zero, which is also the Unicode "default" in the sense that
	 * there is no case mapping by default (unless we fill it in)
	 */
	if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
	    !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
	    !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}

	/*
	 * the simple mappings are read first so that the special
	 * cases read afterwards overwrite them
	 */
	parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
	                         NULL);
	parse_file_with_callback("data/SpecialCasing.txt",
	                         specialcasing_callback, NULL);

	/* compress properties */
	properties_compress(prop_upper, &comp_upper);
	properties_compress(prop_lower, &comp_lower);
	properties_compress(prop_title, &comp_title);

	fprintf(stderr,
	        "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, "
	        "title=%.2f%%\n",
	        argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
	        properties_get_major_minor(&comp_lower, &mm_lower),
	        properties_get_major_minor(&comp_title, &mm_title));

	/* print tables */
	printf("/* Automatically generated by %s */\n#include "
	       "<stdint.h>\n#include <stddef.h>\n\n",
	       argv[0]);

	printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t "
	       "cplen;\n};\n\n");

	/*
	 * NOTE(review): 0x1100 is presumably the number of major
	 * entries (0x110000 codepoints / 0x100 per block) - confirm
	 * against properties_get_major_minor()
	 */
	properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
	printf("\n");
	properties_print_derived_lookup_table("upper_minor", mm_upper.minor,
	                                      mm_upper.minorlen, get_value,
	                                      comp_upper.data);
	printf("\n");
	properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
	printf("\n");
	properties_print_derived_lookup_table("lower_minor", mm_lower.minor,
	                                      mm_lower.minorlen, get_value,
	                                      comp_lower.data);
	printf("\n");
	properties_print_lookup_table("title_major", mm_title.major, 0x1100);
	printf("\n");
	properties_print_derived_lookup_table("title_minor", mm_title.minor,
	                                      mm_title.minorlen, get_value,
	                                      comp_title.data);
	printf("\n");

	/* one table per conversion, all indexed by the same rule number */
	printf("static const struct special_case upper_special[] = {\n");
	for (i = 0; i < sclen; i++) {
		print_special_case_entry(sc[i].upper.cp, sc[i].upper.cplen);
	}
	printf("};\n\n");

	printf("static const struct special_case lower_special[] = {\n");
	for (i = 0; i < sclen; i++) {
		print_special_case_entry(sc[i].lower.cp, sc[i].lower.cplen);
	}
	printf("};\n\n");

	printf("static const struct special_case title_special[] = {\n");
	for (i = 0; i < sclen; i++) {
		print_special_case_entry(sc[i].title.cp, sc[i].title.cplen);
	}
	printf("};\n\n");

	/*
	 * NOTE(review): prop_upper/prop_lower/prop_title and sc are not
	 * released here; the process ends right after, but confirm the
	 * ownership rules of properties_compress() and parse_cp_list()
	 * before adding matching free() calls
	 */
	free(comp_lower.data);
	free(comp_lower.offset);
	free(comp_title.data);
	free(comp_title.offset);
	free(comp_upper.data);
	free(comp_upper.offset);
	free(mm_lower.major);
	free(mm_lower.minor);
	free(mm_title.major);
	free(mm_title.minor);
	free(mm_upper.major);
	free(mm_upper.minor);

	return 0;
}