blob: b4a4fb712d37675306bf0f8bf18c54cf39894d85 [file] [log] [blame]
/* See LICENSE file for copyright and license details. */
#include <errno.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "util.h"
/* paths of the Unicode data files this generator reads */
#define FILE_BIDI_BRACKETS "data/BidiBrackets.txt"
#define FILE_BIDI_CLASS "data/DerivedBidiClass.txt"
#define FILE_BIDI_MIRRORING "data/BidiMirroring.txt"
#define FILE_UNICODE_DATA "data/UnicodeData.txt"
/* capacity of the base and pair alias arrays of one bracket entry */
#define NUM_BRACKET_ALIASES 20
/*
 * Bidi classes, all read from DerivedBidiClass.txt.
 *
 * Do not reorder: fill_missing() hard-codes the positions of
 * L (0), AL (1), ET (8) and R (17) in this table.
 */
static const struct property_spec bidi_property[] = {
	/*  0 (default) */
	{ .enumname = "L",   .file = FILE_BIDI_CLASS, .ucdname = "L"   },
	/*  1 */
	{ .enumname = "AL",  .file = FILE_BIDI_CLASS, .ucdname = "AL"  },
	/*  2 */
	{ .enumname = "AN",  .file = FILE_BIDI_CLASS, .ucdname = "AN"  },
	/*  3 */
	{ .enumname = "B",   .file = FILE_BIDI_CLASS, .ucdname = "B"   },
	/*  4 */
	{ .enumname = "BN",  .file = FILE_BIDI_CLASS, .ucdname = "BN"  },
	/*  5 */
	{ .enumname = "CS",  .file = FILE_BIDI_CLASS, .ucdname = "CS"  },
	/*  6 */
	{ .enumname = "EN",  .file = FILE_BIDI_CLASS, .ucdname = "EN"  },
	/*  7 */
	{ .enumname = "ES",  .file = FILE_BIDI_CLASS, .ucdname = "ES"  },
	/*  8 */
	{ .enumname = "ET",  .file = FILE_BIDI_CLASS, .ucdname = "ET"  },
	/*  9 */
	{ .enumname = "FSI", .file = FILE_BIDI_CLASS, .ucdname = "FSI" },
	/* 10 */
	{ .enumname = "LRE", .file = FILE_BIDI_CLASS, .ucdname = "LRE" },
	/* 11 */
	{ .enumname = "LRI", .file = FILE_BIDI_CLASS, .ucdname = "LRI" },
	/* 12 */
	{ .enumname = "LRO", .file = FILE_BIDI_CLASS, .ucdname = "LRO" },
	/* 13 */
	{ .enumname = "NSM", .file = FILE_BIDI_CLASS, .ucdname = "NSM" },
	/* 14 */
	{ .enumname = "ON",  .file = FILE_BIDI_CLASS, .ucdname = "ON"  },
	/* 15 */
	{ .enumname = "PDF", .file = FILE_BIDI_CLASS, .ucdname = "PDF" },
	/* 16 */
	{ .enumname = "PDI", .file = FILE_BIDI_CLASS, .ucdname = "PDI" },
	/* 17 */
	{ .enumname = "R",   .file = FILE_BIDI_CLASS, .ucdname = "R"   },
	/* 18 */
	{ .enumname = "RLE", .file = FILE_BIDI_CLASS, .ucdname = "RLE" },
	/* 19 */
	{ .enumname = "RLI", .file = FILE_BIDI_CLASS, .ucdname = "RLI" },
	/* 20 */
	{ .enumname = "RLO", .file = FILE_BIDI_CLASS, .ucdname = "RLO" },
	/* 21 */
	{ .enumname = "S",   .file = FILE_BIDI_CLASS, .ucdname = "S"   },
	/* 22 */
	{ .enumname = "WS",  .file = FILE_BIDI_CLASS, .ucdname = "WS"  },
};
/*
 * In/out payload of decomposition_callback(): the caller sets cp, the
 * callback stores the result in decomposition once it reaches the line
 * describing cp.
 */
struct decomposition_payload {
/* codepoint whose decomposition is requested (input) */
uint_least32_t cp;
/*
 * parsed decomposition field of cp with any leading <...> tag
 * skipped, or cp itself if that field is empty (output)
 */
uint_least32_t decomposition;
};
/*
 * Callback for parse_file_with_callback() on UnicodeData.txt that looks
 * up the decomposition of a single codepoint.
 *
 * payload must point to a struct decomposition_payload. When the first
 * field of the line equals payload->cp, payload->decomposition is set
 * to the codepoint given in the sixth field (a leading "<...>" tag and
 * the spaces after it are skipped), or to payload->cp itself if that
 * field is empty. Lines describing other codepoints and lines with
 * fewer than 6 fields leave the payload untouched.
 *
 * Returns 0 on success and 1 if the line is malformed, i.e. a codepoint
 * cannot be parsed or a "<...>" tag is not terminated.
 */
static int
decomposition_callback(const char *file, char **field, size_t nfields,
                       char *comment, void *payload)
{
	struct decomposition_payload *decomp = payload;
	uint_least32_t cp;
	char *p;

	(void)file;
	(void)comment;

	if (nfields < 6) {
		/* we have fewer than 6 fields, discard the line */
		return 0;
	}

	/*
	 * hextocp() signals failure with a non-zero return value; without
	 * this check cp could be compared while still indeterminate
	 */
	if (hextocp(field[0], strlen(field[0]), &cp)) {
		return 1;
	}

	if (cp != decomp->cp) {
		/* not the line that contains our decomposition target */
		return 0;
	}

	p = field[5];
	if (*p == '\0') {
		/* empty mapping, the codepoint decomposes to itself */
		decomp->decomposition = decomp->cp;
		return 0;
	}

	if (*p == '<') {
		/*
		 * the decomposition starts with some metadata <...>
		 * that we skip together with the spaces following it
		 */
		if ((p = strchr(p, '>')) == NULL) {
			/* unterminated tag, malformed line */
			return 1;
		}
		for (p++; *p == ' '; p++) {
			;
		}
	}

	return hextocp(p, strlen(p), &(decomp->decomposition)) ? 1 : 0;
}
/*
 * Bracket table built by bracket_callback(). Every entry describes one
 * bracket; by convention the first element of base and pair is the
 * canonical (decomposed) form, further elements are aliases of it.
 * main() allocates entry 0 zeroed as a stub for non-bracket codepoints.
 */
static struct {
/* codepoints of this bracket, canonical form first */
uint_least32_t base[NUM_BRACKET_ALIASES];
/* number of used elements in base */
size_t baselen;
/* codepoints of the paired bracket, canonical form first */
uint_least32_t pair[NUM_BRACKET_ALIASES];
/* number of used elements in pair */
size_t pairlen;
/* identifier an entry shares with the entry of its paired bracket */
uint_least8_t class;
/* 'o' for an opening and 'c' for a closing bracket */
char type;
} *b = NULL;
/* number of entries in b */
static size_t blen;
/* next unassigned class identifier, 0 is left to the zeroed stub entry */
static uint_least8_t bracket_class_count = 1;
/*
 * Append cp to an alias list of capacity NUM_BRACKET_ALIASES unless the
 * list already contains it. Returns 0 on success and 1 (after printing
 * a diagnostic) if the list is full.
 */
static int
bracket_add_alias(uint_least32_t *alias, size_t *aliaslen, uint_least32_t cp)
{
	size_t i;

	for (i = 0; i < *aliaslen; i++) {
		if (alias[i] == cp) {
			/* already in array, do nothing */
			return 0;
		}
	}

	if (*aliaslen == NUM_BRACKET_ALIASES) {
		fprintf(stderr, "too many aliases\n");
		return 1;
	}
	alias[(*aliaslen)++] = cp;

	return 0;
}

/*
 * Callback for parse_file_with_callback() on BidiBrackets.txt.
 *
 * Each line provides a bracket codepoint (field 0), its paired bracket
 * (field 1) and the type 'o' or 'c' (field 2). Both codepoints are
 * reduced to their decomposition from UnicodeData.txt, which serves as
 * the canonical form. If the table b already has an entry for the
 * canonical form, the codepoints of the line are added to it as
 * aliases; otherwise a new entry is appended and given the class of the
 * entry it pairs with, or a fresh class if there is none yet.
 *
 * Note that every processed line parses UnicodeData.txt twice.
 *
 * Returns 0 on success (lines with fewer than 3 fields are ignored) and
 * 1 on malformed input, mismatching types or too many aliases. Exits
 * the program if the table cannot be extended.
 */
static int
bracket_callback(const char *file, char **field, size_t nfields, char *comment,
                 void *payload)
{
	struct decomposition_payload decomp_base, decomp_pair;
	uint_least32_t cp_base, cp_pair;
	size_t i, last;
	char type;

	(void)file;
	(void)comment;
	(void)payload;

	if (nfields < 3) {
		/* we have fewer than 3 fields, discard the line */
		return 0;
	}

	/*
	 * parse and validate all field data before touching the table;
	 * hextocp() signals failure with a non-zero return value
	 */
	if (hextocp(field[0], strlen(field[0]), &cp_base) ||
	    hextocp(field[1], strlen(field[1]), &cp_pair)) {
		return 1;
	}
	if (strlen(field[2]) != 1 ||
	    (field[2][0] != 'o' && field[2][0] != 'c')) {
		/* malformed line */
		return 1;
	}
	type = field[2][0];

	/*
	 * determine decomposition of the base and pair codepoints; a
	 * codepoint without a line in the data file is its own canonical
	 * form, so preset the result instead of leaving it indeterminate
	 */
	decomp_base.cp = decomp_base.decomposition = cp_base;
	decomp_pair.cp = decomp_pair.decomposition = cp_pair;
	parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
	                         &decomp_base);
	parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
	                         &decomp_pair);

	/*
	 * check if we already have the canonical form in the bracket array,
	 * per convention the canonical form is the first element of the alias
	 * array
	 */
	for (i = 0; i < blen; i++) {
		if (decomp_base.decomposition != b[i].base[0]) {
			continue;
		}

		if (b[i].type != type) {
			/* mismatching types */
			return 1;
		}

		/*
		 * add our base and pair aliases to their arrays unless
		 * they are already in them
		 */
		if (bracket_add_alias(b[i].base, &b[i].baselen, cp_base) ||
		    bracket_add_alias(b[i].pair, &b[i].pairlen, cp_pair)) {
			return 1;
		}

		return 0;
	}

	/* extend bracket pair array, as this is a new bracket type */
	if (!(b = realloc(b, (++blen) * sizeof(*b)))) {
		fprintf(stderr, "realloc: %s\n", strerror(errno));
		exit(1);
	}
	last = blen - 1;

	/* realloc() does not clear the new entry, so do it here */
	memset(&b[last], 0, sizeof(*b));

	/* fill field data by adding the canonical form first */
	b[last].base[0] = decomp_base.decomposition;
	b[last].baselen = 1;
	b[last].pair[0] = decomp_pair.decomposition;
	b[last].pairlen = 1;
	b[last].type = type;

	/*
	 * add alias if it differs from the canonical form; this cannot
	 * fail, as each array holds only one element at this point
	 */
	(void)bracket_add_alias(b[last].base, &b[last].baselen, cp_base);
	(void)bracket_add_alias(b[last].pair, &b[last].pairlen, cp_pair);

	/*
	 * determine bracket class by iterating over the bracket-array
	 * and seeing if our current canonical cp already has a matching pair.
	 * We only need to check the first entry in each bracket alias
	 * list, as this is, per convention, the canonical form.
	 */
	for (i = 0; i < last; i++) {
		if (b[i].pair[0] == b[last].base[0]) {
			/* matched class */
			b[last].class = b[i].class;
			return 0;
		}
	}

	/* no match, assign a new class */
	b[last].class = bracket_class_count++;

	return 0;
}
static void
post_process(struct properties *prop)
{
size_t i, j;
for (i = 0; i < blen; i++) {
/*
* given the base property fits in 5 bits, we simply
* store the bracket-offset in the bits above that.
*
* All those properties that are not set here implicitly
* have offset 0, which we prepared to contain a stub
* for a character that is not a bracket.
*/
for (j = 0; j < b[i].baselen; j++) {
prop[b[i].base[j]].property |= (i << 5);
}
}
}
/*
 * Return the bidi class (as a position in bidi_property) of a
 * codepoint that is not listed explicitly, based on the
 * @missing-properties in data/DerivedBidiClass.txt.
 *
 * The ranges below are disjoint; anything outside of them is class L.
 */
static uint_least8_t
fill_missing(uint_least32_t cp)
{
	static const struct {
		uint_least32_t first;
		uint_least32_t last;
		uint_least8_t class;
	} missing[] = {
		{ UINT32_C(0x0590),  UINT32_C(0x05FF),  17 }, /* R  */
		{ UINT32_C(0x0600),  UINT32_C(0x07BF),   1 }, /* AL */
		{ UINT32_C(0x07C0),  UINT32_C(0x085F),  17 }, /* R  */
		{ UINT32_C(0x0860),  UINT32_C(0x08FF),   1 }, /* AL */
		{ UINT32_C(0x20A0),  UINT32_C(0x20CF),   8 }, /* ET */
		{ UINT32_C(0xFB1D),  UINT32_C(0xFB4F),  17 }, /* R  */
		{ UINT32_C(0xFB50),  UINT32_C(0xFDCF),   1 }, /* AL */
		{ UINT32_C(0xFDF0),  UINT32_C(0xFDFF),   1 }, /* AL */
		{ UINT32_C(0xFE70),  UINT32_C(0xFEFF),   1 }, /* AL */
		{ UINT32_C(0x10800), UINT32_C(0x10CFF), 17 }, /* R  */
		{ UINT32_C(0x10D00), UINT32_C(0x10D3F),  1 }, /* AL */
		{ UINT32_C(0x10D40), UINT32_C(0x10EBF), 17 }, /* R  */
		{ UINT32_C(0x10EC0), UINT32_C(0x10EFF),  1 }, /* AL */
		{ UINT32_C(0x10F00), UINT32_C(0x10F2F), 17 }, /* R  */
		{ UINT32_C(0x10F30), UINT32_C(0x10F6F),  1 }, /* AL */
		{ UINT32_C(0x10F70), UINT32_C(0x10FFF), 17 }, /* R  */
		{ UINT32_C(0x1E800), UINT32_C(0x1EC6F), 17 }, /* R  */
		{ UINT32_C(0x1EC70), UINT32_C(0x1ECBF),  1 }, /* AL */
		{ UINT32_C(0x1ECC0), UINT32_C(0x1ECFF), 17 }, /* R  */
		{ UINT32_C(0x1ED00), UINT32_C(0x1ED4F),  1 }, /* AL */
		{ UINT32_C(0x1ED50), UINT32_C(0x1EDFF), 17 }, /* R  */
		{ UINT32_C(0x1EE00), UINT32_C(0x1EEFF),  1 }, /* AL */
		{ UINT32_C(0x1EF00), UINT32_C(0x1EFFF), 17 }, /* R  */
	};
	size_t k;

	for (k = 0; k < sizeof(missing) / sizeof(*missing); k++) {
		if (cp >= missing[k].first && cp <= missing[k].last) {
			return missing[k].class;
		}
	}

	return 0; /* class L */
}
/*
 * per-codepoint mirror offsets; allocated (zeroed) in main() and filled
 * in by mirror_callback()
 */
static struct properties *prop_mirror = NULL;
/*
 * Callback for parse_file_with_callback() on BidiMirroring.txt.
 *
 * For the codepoint in the first field, store the signed distance to
 * its mirrored codepoint (second field) in prop_mirror. If the second
 * field is missing or empty the codepoint is its own mirror and the
 * stored distance is 0.
 *
 * Returns 0 on success (lines without any field are ignored) and 1 if
 * a codepoint cannot be parsed or lies outside of prop_mirror.
 */
static int
mirror_callback(const char *file, char **field, size_t nfields, char *comment,
                void *payload)
{
	uint_least32_t cp, cp_mirror;

	(void)file;
	(void)comment;
	(void)payload;

	if (nfields < 1) {
		/* there is no first field to read, discard the line */
		return 0;
	}

	/* hextocp() signals failure with a non-zero return value */
	if (hextocp(field[0], strlen(field[0]), &cp)) {
		return 1;
	}

	if (cp >= UINT32_C(0x110000)) {
		/* prop_mirror only holds 0x110000 entries */
		return 1;
	}

	cp_mirror = cp;
	if (nfields >= 2 && strlen(field[1]) > 0 &&
	    hextocp(field[1], strlen(field[1]), &cp_mirror)) {
		return 1;
	}

	prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least32_t)cp;

	return 0;
}
/*
 * Accessor passed to properties_print_derived_lookup_table(): yields
 * the property value of the element at index offset in prop.
 */
static int_least64_t
get_value(const struct properties *prop, size_t offset)
{
	const struct properties *entry = prop + offset;

	return entry->property;
}
/*
 * Generate the bidirectional tables and print them to stdout: the bidi
 * class property table, the bidi_bracket array and the mirror_major and
 * mirror_minor lookup tables. The compression ratio of the mirror table
 * is reported on stderr.
 *
 * Returns 0 on success and 1 if writing to stdout failed. Allocation
 * failures terminate the program via exit(1).
 */
int
main(int argc, char *argv[])
{
	struct properties_compressed comp_mirror;
	struct properties_major_minor mm_mirror;
	size_t i;

	(void)argc;

	/*
	 * the first element in the bracket array is initialized to
	 * all-zeros, as we use the implicit 0-offset for all those
	 * codepoints that are not a bracket
	 */
	if (!(b = calloc((blen = 1), sizeof(*b)))) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}
	parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, NULL);

	properties_generate_break_property(bidi_property, LEN(bidi_property),
	                                   fill_missing, NULL, post_process,
	                                   "bidi", argv[0]);

	printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t"
	       "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n"
	       "static const struct bracket {\n\tenum bracket_type type;\n"
	       "\tuint_least8_t class;\n} bidi_bracket[] = {\n");
	for (i = 0; i < blen; i++) {
		printf("\t{\n\t\t.type = %s,\n\t\t.class = "
		       "%" PRIuLEAST8 ",\n\t},\n",
		       (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" :
		       (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" :
		                            "BIDI_BRACKET_NONE",
		       b[i].class);
	}
	printf("};\n");

	/*
	 * allocate property buffer for all 0x110000 codepoints
	 *
	 * the buffers contain the offset from the "base" character
	 * to the respective mirrored character. By callocing we set all
	 * fields to zero, which is also the Unicode "default" in the sense
	 * that the code point is its mirror (unless we fill it in)
	 */
	if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirror)))) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}
	parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, NULL);

	/* compress properties */
	properties_compress(prop_mirror, &comp_mirror);

	fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", argv[0],
	        properties_get_major_minor(&comp_mirror, &mm_mirror));

	/* print tables */
	properties_print_lookup_table("mirror_major", mm_mirror.major, 0x1100);
	printf("\n");
	properties_print_derived_lookup_table("mirror_minor", mm_mirror.minor,
	                                      mm_mirror.minorlen, get_value,
	                                      comp_mirror.data);

	/*
	 * release the buffers
	 *
	 * NOTE(review): prop_mirror is left alone on purpose, as this file
	 * does not show whether properties_compress() keeps a reference to
	 * it in comp_mirror; confirm before freeing it here as well
	 */
	free(b);
	b = NULL;
	blen = 0;
	free(comp_mirror.data);
	free(comp_mirror.offset);
	free(mm_mirror.major);
	free(mm_mirror.minor);

	/*
	 * the output usually ends up in a generated header, so a failed
	 * write must not go unnoticed
	 */
	if (fflush(stdout) == EOF || ferror(stdout)) {
		fprintf(stderr, "%s: error writing to stdout\n", argv[0]);
		return 1;
	}

	return 0;
}