blob: 2e08acca1f9fa66716edca154824dd6f6c701def [file] [log] [blame]
/* See LICENSE file for copyright and license details. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "util.h"
#define FILE_EAW "data/EastAsianWidth.txt"
#define FILE_EMOJI "data/emoji-data.txt"
#define FILE_LINE "data/LineBreak.txt"
/*
 * Table of line-break classes that main() hands to
 * properties_generate_break_property().
 *
 * Each entry carries the name of an enum constant (.enumname) and,
 * where given, a data file (.file) together with a property name in
 * that file (.ucdname); presumably the generator in util matches data
 * lines against these two fields -- confirm against util.
 *
 * Entries with .file == NULL and .ucdname == NULL name no data file:
 * BOTH_CN_EXTPICT, CP_WITH_EAW_HWF and OP_WITH_EAW_HWF are selected
 * by handle_conflict(), and TMP_XX is only referenced in
 * post_process().
 *
 * All TMP_* entries are intermediate classes; post_process() maps
 * every one of them to a final class (AL, CM or NS).
 *
 * handle_conflict() and post_process() locate entries by comparing
 * .enumname strings, so renaming an entry here requires the same
 * rename in those functions.
 */
static const struct property_spec line_break_property[] = {
{
.enumname = "AL",
.file = FILE_LINE,
.ucdname = "AL",
},
/*
 * Both Extended_Pictographic and Cn are large classes, but we are
 * only interested in their intersection for LB30b, so we have the
 * following two temporary classes. At first the extpict-class is
 * filled, then the cn-class, which leads to conflicts (that we
 * handle by putting them in the "proper" class BOTH_CN_EXTPICT).
 * We make use of the fact that there is no intersection between
 * AL and Cn.
 *
 * Any subsequent conflicts are permitted to overwrite
 * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need
 * them, and in the final postprocessing we "reset" all remaining
 * matches (that then didn't fit any of the other classes) to the
 * generic class AL.
 *
 * NOTE(review): the table lists TMP_CN before
 * TMP_EXTENDED_PICTOGRAPHIC, while the text above says the
 * extpict-class is filled first; the actual fill order is decided
 * by code not visible in this file -- confirm.
 */
{
.enumname = "TMP_CN",
.file = FILE_LINE,
.ucdname = "Cn",
},
{
.enumname = "TMP_EXTENDED_PICTOGRAPHIC",
.file = FILE_EMOJI,
.ucdname = "Extended_Pictographic",
},
/* end of special block */
{
.enumname = "B2",
.file = FILE_LINE,
.ucdname = "B2",
},
{
.enumname = "BA",
.file = FILE_LINE,
.ucdname = "BA",
},
{
.enumname = "BB",
.file = FILE_LINE,
.ucdname = "BB",
},
{
.enumname = "BK",
.file = FILE_LINE,
.ucdname = "BK",
},
/* selected by handle_conflict() for codepoints in both TMP_CN and
 * TMP_EXTENDED_PICTOGRAPHIC */
{
.enumname = "BOTH_CN_EXTPICT",
.file = NULL,
.ucdname = NULL,
},
{
.enumname = "CB",
.file = FILE_LINE,
.ucdname = "CB",
},
{
.enumname = "CL",
.file = FILE_LINE,
.ucdname = "CL",
},
{
.enumname = "CM",
.file = FILE_LINE,
.ucdname = "CM",
},
/* CP is split in two: handle_conflict() moves a codepoint to
 * CP_WITH_EAW_HWF when CP_WITHOUT_EAW_HWF collides with one of the
 * TMP_EAW_* classes */
{
.enumname = "CP_WITHOUT_EAW_HWF",
.file = FILE_LINE,
.ucdname = "CP",
},
{
.enumname = "CP_WITH_EAW_HWF",
.file = NULL,
.ucdname = NULL,
},
{
.enumname = "CR",
.file = FILE_LINE,
.ucdname = "CR",
},
{
.enumname = "EB",
.file = FILE_LINE,
.ucdname = "EB",
},
{
.enumname = "EM",
.file = FILE_LINE,
.ucdname = "EM",
},
{
.enumname = "EX",
.file = FILE_LINE,
.ucdname = "EX",
},
{
.enumname = "GL",
.file = FILE_LINE,
.ucdname = "GL",
},
{
.enumname = "H2",
.file = FILE_LINE,
.ucdname = "H2",
},
{
.enumname = "H3",
.file = FILE_LINE,
.ucdname = "H3",
},
{
.enumname = "HL",
.file = FILE_LINE,
.ucdname = "HL",
},
{
.enumname = "HY",
.file = FILE_LINE,
.ucdname = "HY",
},
{
.enumname = "ID",
.file = FILE_LINE,
.ucdname = "ID",
},
{
.enumname = "IN",
.file = FILE_LINE,
.ucdname = "IN",
},
{
.enumname = "IS",
.file = FILE_LINE,
.ucdname = "IS",
},
{
.enumname = "JL",
.file = FILE_LINE,
.ucdname = "JL",
},
{
.enumname = "JT",
.file = FILE_LINE,
.ucdname = "JT",
},
{
.enumname = "JV",
.file = FILE_LINE,
.ucdname = "JV",
},
{
.enumname = "LF",
.file = FILE_LINE,
.ucdname = "LF",
},
{
.enumname = "NL",
.file = FILE_LINE,
.ucdname = "NL",
},
{
.enumname = "NS",
.file = FILE_LINE,
.ucdname = "NS",
},
{
.enumname = "NU",
.file = FILE_LINE,
.ucdname = "NU",
},
/* OP is split the same way as CP: handle_conflict() selects
 * OP_WITH_EAW_HWF when OP_WITHOUT_EAW_HWF collides with one of the
 * TMP_EAW_* classes */
{
.enumname = "OP_WITHOUT_EAW_HWF",
.file = FILE_LINE,
.ucdname = "OP",
},
{
.enumname = "OP_WITH_EAW_HWF",
.file = NULL,
.ucdname = NULL,
},
{
.enumname = "PO",
.file = FILE_LINE,
.ucdname = "PO",
},
{
.enumname = "PR",
.file = FILE_LINE,
.ucdname = "PR",
},
{
.enumname = "QU",
.file = FILE_LINE,
.ucdname = "QU",
},
{
.enumname = "RI",
.file = FILE_LINE,
.ucdname = "RI",
},
{
.enumname = "SP",
.file = FILE_LINE,
.ucdname = "SP",
},
{
.enumname = "SY",
.file = FILE_LINE,
.ucdname = "SY",
},
{
.enumname = "WJ",
.file = FILE_LINE,
.ucdname = "WJ",
},
{
.enumname = "ZW",
.file = FILE_LINE,
.ucdname = "ZW",
},
{
.enumname = "ZWJ",
.file = FILE_LINE,
.ucdname = "ZWJ",
},
/* temporary classes below; post_process() maps each of them to a
 * final class */
{
.enumname = "TMP_AI",
.file = FILE_LINE,
.ucdname = "AI",
},
{
.enumname = "TMP_CJ",
.file = FILE_LINE,
.ucdname = "CJ",
},
{
.enumname = "TMP_XX",
.file = NULL,
.ucdname = NULL,
},
{
.enumname = "TMP_MN",
.file = FILE_LINE,
.ucdname = "Mn",
},
{
.enumname = "TMP_MC",
.file = FILE_LINE,
.ucdname = "Mc",
},
/*
 * NOTE(review): the two SA entries name the same file and the same
 * ucdname; which of them a data line selects depends on the
 * matching logic in util, which is not visible here -- confirm.
 * post_process() maps the WITHOUT variant to AL and the WITH
 * variant to CM.
 */
{
.enumname = "TMP_SA_WITHOUT_MN_OR_MC",
.file = FILE_LINE,
.ucdname = "SA",
},
{
.enumname = "TMP_SA_WITH_MN_OR_MC",
.file = FILE_LINE,
.ucdname = "SA",
},
{
.enumname = "TMP_SG",
.file = FILE_LINE,
.ucdname = "SG",
},
/* values H, W and F from the East Asian Width data file; they only
 * matter to handle_conflict() for splitting CP and OP */
{
.enumname = "TMP_EAW_H",
.file = FILE_EAW,
.ucdname = "H",
},
{
.enumname = "TMP_EAW_W",
.file = FILE_EAW,
.ucdname = "W",
},
{
.enumname = "TMP_EAW_F",
.file = FILE_EAW,
.ucdname = "F",
},
};
/*
 * Decide which class a codepoint ends up in when two entries of
 * line_break_property both claim it.
 *
 * cp    - the codepoint in question (currently unused)
 * prop1 - index into line_break_property of one claimant; the error
 *         message prints it as the left side of "prop1 <- prop2", so
 *         presumably it is the class already stored -- confirm
 *         against the caller
 * prop2 - index into line_break_property of the other claimant
 *
 * Returns the index of the class the codepoint should get.
 *
 * Rules, checked in this order:
 *  1. One side is TMP_EAW_{H,W,F}: CP_WITHOUT_EAW_HWF becomes
 *     CP_WITH_EAW_HWF, OP_WITHOUT_EAW_HWF becomes OP_WITH_EAW_HWF,
 *     for anything else the EAW class is dropped.
 *  2. One side is TMP_MN or TMP_MC: TMP_SA_WITHOUT_MN_OR_MC becomes
 *     TMP_SA_WITH_MN_OR_MC, for anything else Mn/Mc is dropped.
 *  3. One side is TMP_CN: together with TMP_EXTENDED_PICTOGRAPHIC it
 *     becomes BOTH_CN_EXTPICT, for anything else Cn is dropped.
 *  4. One side is TMP_EXTENDED_PICTOGRAPHIC: it is dropped in favour
 *     of the other class.
 *
 * When both sides belong to the group that is being dropped, prop2
 * is returned.
 *
 * The process exits with status 1 if no rule applies or if a
 * combined class cannot be found in the table.
 */
static uint_least8_t
handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
{
	const char *name1 = line_break_property[prop1].enumname;
	const char *name2 = line_break_property[prop2].enumname;
	const char *target = NULL;
	uint_least8_t result = prop2;
	int eaw1, eaw2, mark1, mark2, cn1, cn2, pict1, pict2;

	(void)cp;

	/* classify both claimants once instead of re-comparing below */
	eaw1 = !strcmp(name1, "TMP_EAW_H") || !strcmp(name1, "TMP_EAW_W") ||
	       !strcmp(name1, "TMP_EAW_F");
	eaw2 = !strcmp(name2, "TMP_EAW_H") || !strcmp(name2, "TMP_EAW_W") ||
	       !strcmp(name2, "TMP_EAW_F");
	mark1 = !strcmp(name1, "TMP_MN") || !strcmp(name1, "TMP_MC");
	mark2 = !strcmp(name2, "TMP_MN") || !strcmp(name2, "TMP_MC");
	cn1 = !strcmp(name1, "TMP_CN");
	cn2 = !strcmp(name2, "TMP_CN");
	pict1 = !strcmp(name1, "TMP_EXTENDED_PICTOGRAPHIC");
	pict2 = !strcmp(name2, "TMP_EXTENDED_PICTOGRAPHIC");

	if (eaw1 || eaw2) {
		if (!strcmp(name1, "CP_WITHOUT_EAW_HWF") ||
		    !strcmp(name2, "CP_WITHOUT_EAW_HWF")) {
			target = "CP_WITH_EAW_HWF";
		} else if (!strcmp(name1, "OP_WITHOUT_EAW_HWF") ||
		           !strcmp(name2, "OP_WITHOUT_EAW_HWF")) {
			target = "OP_WITH_EAW_HWF";
		} else {
			/* ignore EAW for the rest */
			result = eaw1 ? prop2 : prop1;
		}
	} else if (mark1 || mark2) {
		/*
		 * The names must carry the TMP_ prefix to match the
		 * table entries; post_process() later maps
		 * TMP_SA_WITH_MN_OR_MC to CM.
		 */
		if (!strcmp(name1, "TMP_SA_WITHOUT_MN_OR_MC") ||
		    !strcmp(name2, "TMP_SA_WITHOUT_MN_OR_MC")) {
			target = "TMP_SA_WITH_MN_OR_MC";
		} else {
			/* ignore Mn and Mc for the rest */
			result = mark1 ? prop2 : prop1;
		}
	} else if (cn1 || cn2) {
		if (pict1 || pict2) {
			target = "BOTH_CN_EXTPICT";
		} else {
			/* ignore Cn for all the other properties */
			result = cn1 ? prop2 : prop1;
		}
	} else if (pict1 || pict2) {
		/*
		 * Neither side is TMP_CN here (the previous branch
		 * would have taken it), so Extended_Pictographic is
		 * simply ignored in favour of the other property.
		 */
		result = pict1 ? prop2 : prop1;
	} else {
		fprintf(stderr,
		        "handle_conflict: Cannot handle conflict %s <- %s.\n",
		        name1, name2);
		exit(1);
	}

	if (target) {
		/* translate the combined class name into its table index */
		for (result = 0; result < LEN(line_break_property); result++) {
			if (!strcmp(line_break_property[result].enumname,
			            target)) {
				break;
			}
		}
		if (result == LEN(line_break_property)) {
			fprintf(stderr, "handle_conflict: Internal error.\n");
			exit(1);
		}
	}

	return result;
}
/*
 * Replace every temporary class by its final line-break class.
 *
 * prop - array indexed by codepoint; entries 0 .. 0x10FFFF are
 *        visited. Each entry's .property member is read as an index
 *        into line_break_property and overwritten in place when it
 *        names a temporary class.
 *
 * Mapping:
 *   TMP_AI, TMP_SG, TMP_XX      -> AL
 *   TMP_SA_WITH_MN_OR_MC        -> CM
 *   TMP_SA_WITHOUT_MN_OR_MC     -> AL
 *   TMP_CJ                      -> NS
 *   TMP_CN, TMP_EXTENDED_PICTOGRAPHIC, TMP_MN, TMP_MC,
 *   TMP_EAW_H, TMP_EAW_W, TMP_EAW_F (leftovers) -> AL
 *
 * All other classes are left untouched. The process exits with
 * status 1 if a mapping target is missing from the table.
 */
static void
post_process(struct properties *prop)
{
	const char *name;
	const char *target;
	uint_least8_t result;
	size_t i;

	/* post-mapping according to the line breaking algorithm */
	for (i = 0; i < UINT32_C(0x110000); i++) {
		name = line_break_property[prop[i].property].enumname;

		/* LB1 */
		if (!strcmp(name, "TMP_AI") ||
		    !strcmp(name, "TMP_SG") ||
		    !strcmp(name, "TMP_XX")) {
			/* map AI, SG and XX to AL */
			target = "AL";
		} else if (!strcmp(name, "TMP_SA_WITH_MN_OR_MC")) {
			/* map SA (with General_Category Mn or Mc) to CM */
			target = "CM";
		} else if (!strcmp(name, "TMP_SA_WITHOUT_MN_OR_MC")) {
			/* map SA (without General_Category Mn or Mc) to AL */
			target = "AL";
		} else if (!strcmp(name, "TMP_CJ")) {
			/* map CJ to NS */
			target = "NS";
		} else if (!strcmp(name, "TMP_CN") ||
		           !strcmp(name, "TMP_EXTENDED_PICTOGRAPHIC") ||
		           !strcmp(name, "TMP_MN") ||
		           !strcmp(name, "TMP_MC") ||
		           !strcmp(name, "TMP_EAW_H") ||
		           !strcmp(name, "TMP_EAW_W") ||
		           !strcmp(name, "TMP_EAW_F")) {
			/* map all the temporary classes "residue" to AL */
			target = "AL";
		} else {
			/* already a final class */
			target = NULL;
		}

		if (target) {
			/* translate the target name into its table index */
			for (result = 0; result < LEN(line_break_property);
			     result++) {
				if (!strcmp(line_break_property[result]
				                    .enumname,
				            target)) {
					break;
				}
			}
			if (result == LEN(line_break_property)) {
				/* name this function, not handle_conflict */
				fprintf(stderr,
				        "post_process: Internal error.\n");
				exit(1);
			}
			prop[i].property = result;
		}
	}
}
/*
 * Entry point: pass the line-break class table together with the
 * conflict handler and the post-processing hook to
 * properties_generate_break_property().
 *
 * argc is unused; argv[0] is forwarded as the last argument.
 * NOTE(review): the meaning of the NULL third argument and of the
 * "line_break" string is defined by util, not visible here.
 *
 * Always returns 0.
 */
int
main(int argc, char *argv[])
{
	char *self = argv[0];

	(void)argc;

	properties_generate_break_property(line_break_property,
	                                   LEN(line_break_property),
	                                   NULL,
	                                   handle_conflict,
	                                   post_process,
	                                   "line_break",
	                                   self);

	return 0;
}