| /* See LICENSE file for copyright and license details. */ |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "util.h" |
| |
| #define FILE_EAW "data/EastAsianWidth.txt" |
| #define FILE_EMOJI "data/emoji-data.txt" |
| #define FILE_LINE "data/LineBreak.txt" |
| |
| static const struct property_spec line_break_property[] = { |
| { |
| .enumname = "AL", |
| .file = FILE_LINE, |
| .ucdname = "AL", |
| }, |
| /* |
| * Both extended pictographic and cn are large classes, |
| * but we are only interested in their intersection for LB30b, |
| * so we have the following two temporary classes. At first |
| * the extpict-class is filled, then the cn-class, which leads |
| * to conflicts (that we handle by putting them in the "proper" |
| * class BOTH_CN_EXTPICT). We make use of the fact that there |
| * is no intersection between AL and Cn. |
| * |
| * Any consecutive conflicts are permitted to overwrite |
| * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need |
| * them, and in the final postprocessing we "reset" all |
| * remaining matches (that then didn't fit any of the other |
| * classes) to the generic class AL. |
| */ |
| { |
| .enumname = "TMP_CN", |
| .file = FILE_LINE, |
| .ucdname = "Cn", |
| }, |
| { |
| .enumname = "TMP_EXTENDED_PICTOGRAPHIC", |
| .file = FILE_EMOJI, |
| .ucdname = "Extended_Pictographic", |
| }, |
| /* end of special block */ |
| { |
| .enumname = "B2", |
| .file = FILE_LINE, |
| .ucdname = "B2", |
| }, |
| { |
| .enumname = "BA", |
| .file = FILE_LINE, |
| .ucdname = "BA", |
| }, |
| { |
| .enumname = "BB", |
| .file = FILE_LINE, |
| .ucdname = "BB", |
| }, |
| { |
| .enumname = "BK", |
| .file = FILE_LINE, |
| .ucdname = "BK", |
| }, |
| { |
| .enumname = "BOTH_CN_EXTPICT", |
| .file = NULL, |
| .ucdname = NULL, |
| }, |
| { |
| .enumname = "CB", |
| .file = FILE_LINE, |
| .ucdname = "CB", |
| }, |
| { |
| .enumname = "CL", |
| .file = FILE_LINE, |
| .ucdname = "CL", |
| }, |
| { |
| .enumname = "CM", |
| .file = FILE_LINE, |
| .ucdname = "CM", |
| }, |
| { |
| .enumname = "CP_WITHOUT_EAW_HWF", |
| .file = FILE_LINE, |
| .ucdname = "CP", |
| }, |
| { |
| .enumname = "CP_WITH_EAW_HWF", |
| .file = NULL, |
| .ucdname = NULL, |
| }, |
| { |
| .enumname = "CR", |
| .file = FILE_LINE, |
| .ucdname = "CR", |
| }, |
| { |
| .enumname = "EB", |
| .file = FILE_LINE, |
| .ucdname = "EB", |
| }, |
| { |
| .enumname = "EM", |
| .file = FILE_LINE, |
| .ucdname = "EM", |
| }, |
| { |
| .enumname = "EX", |
| .file = FILE_LINE, |
| .ucdname = "EX", |
| }, |
| { |
| .enumname = "GL", |
| .file = FILE_LINE, |
| .ucdname = "GL", |
| }, |
| { |
| .enumname = "H2", |
| .file = FILE_LINE, |
| .ucdname = "H2", |
| }, |
| { |
| .enumname = "H3", |
| .file = FILE_LINE, |
| .ucdname = "H3", |
| }, |
| { |
| .enumname = "HL", |
| .file = FILE_LINE, |
| .ucdname = "HL", |
| }, |
| { |
| .enumname = "HY", |
| .file = FILE_LINE, |
| .ucdname = "HY", |
| }, |
| { |
| .enumname = "ID", |
| .file = FILE_LINE, |
| .ucdname = "ID", |
| }, |
| { |
| .enumname = "IN", |
| .file = FILE_LINE, |
| .ucdname = "IN", |
| }, |
| { |
| .enumname = "IS", |
| .file = FILE_LINE, |
| .ucdname = "IS", |
| }, |
| { |
| .enumname = "JL", |
| .file = FILE_LINE, |
| .ucdname = "JL", |
| }, |
| { |
| .enumname = "JT", |
| .file = FILE_LINE, |
| .ucdname = "JT", |
| }, |
| { |
| .enumname = "JV", |
| .file = FILE_LINE, |
| .ucdname = "JV", |
| }, |
| { |
| .enumname = "LF", |
| .file = FILE_LINE, |
| .ucdname = "LF", |
| }, |
| { |
| .enumname = "NL", |
| .file = FILE_LINE, |
| .ucdname = "NL", |
| }, |
| { |
| .enumname = "NS", |
| .file = FILE_LINE, |
| .ucdname = "NS", |
| }, |
| { |
| .enumname = "NU", |
| .file = FILE_LINE, |
| .ucdname = "NU", |
| }, |
| { |
| .enumname = "OP_WITHOUT_EAW_HWF", |
| .file = FILE_LINE, |
| .ucdname = "OP", |
| }, |
| { |
| .enumname = "OP_WITH_EAW_HWF", |
| .file = NULL, |
| .ucdname = NULL, |
| }, |
| { |
| .enumname = "PO", |
| .file = FILE_LINE, |
| .ucdname = "PO", |
| }, |
| { |
| .enumname = "PR", |
| .file = FILE_LINE, |
| .ucdname = "PR", |
| }, |
| { |
| .enumname = "QU", |
| .file = FILE_LINE, |
| .ucdname = "QU", |
| }, |
| { |
| .enumname = "RI", |
| .file = FILE_LINE, |
| .ucdname = "RI", |
| }, |
| { |
| .enumname = "SP", |
| .file = FILE_LINE, |
| .ucdname = "SP", |
| }, |
| { |
| .enumname = "SY", |
| .file = FILE_LINE, |
| .ucdname = "SY", |
| }, |
| { |
| .enumname = "WJ", |
| .file = FILE_LINE, |
| .ucdname = "WJ", |
| }, |
| { |
| .enumname = "ZW", |
| .file = FILE_LINE, |
| .ucdname = "ZW", |
| }, |
| { |
| .enumname = "ZWJ", |
| .file = FILE_LINE, |
| .ucdname = "ZWJ", |
| }, |
| { |
| .enumname = "TMP_AI", |
| .file = FILE_LINE, |
| .ucdname = "AI", |
| }, |
| { |
| .enumname = "TMP_CJ", |
| .file = FILE_LINE, |
| .ucdname = "CJ", |
| }, |
| { |
| .enumname = "TMP_XX", |
| .file = NULL, |
| .ucdname = NULL, |
| }, |
| { |
| .enumname = "TMP_MN", |
| .file = FILE_LINE, |
| .ucdname = "Mn", |
| }, |
| { |
| .enumname = "TMP_MC", |
| .file = FILE_LINE, |
| .ucdname = "Mc", |
| }, |
| { |
| .enumname = "TMP_SA_WITHOUT_MN_OR_MC", |
| .file = FILE_LINE, |
| .ucdname = "SA", |
| }, |
| { |
| .enumname = "TMP_SA_WITH_MN_OR_MC", |
| .file = FILE_LINE, |
| .ucdname = "SA", |
| }, |
| { |
| .enumname = "TMP_SG", |
| .file = FILE_LINE, |
| .ucdname = "SG", |
| }, |
| { |
| .enumname = "TMP_EAW_H", |
| .file = FILE_EAW, |
| .ucdname = "H", |
| }, |
| { |
| .enumname = "TMP_EAW_W", |
| .file = FILE_EAW, |
| .ucdname = "W", |
| }, |
| { |
| .enumname = "TMP_EAW_F", |
| .file = FILE_EAW, |
| .ucdname = "F", |
| }, |
| }; |
| |
| static uint_least8_t |
| handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2) |
| { |
| uint_least8_t result = prop2; |
| char *target = NULL; |
| |
| (void)cp; |
| |
| if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") || |
| !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") || |
| !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) || |
| (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") || |
| !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") || |
| !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) { |
| if (!strcmp(line_break_property[prop1].enumname, |
| "CP_WITHOUT_EAW_HWF") || |
| !strcmp(line_break_property[prop2].enumname, |
| "CP_WITHOUT_EAW_HWF")) { |
| target = "CP_WITH_EAW_HWF"; |
| } else if (!strcmp(line_break_property[prop1].enumname, |
| "OP_WITHOUT_EAW_HWF") || |
| !strcmp(line_break_property[prop2].enumname, |
| "OP_WITHOUT_EAW_HWF")) { |
| target = "OP_WITH_EAW_HWF"; |
| } else { |
| /* ignore EAW for the rest */ |
| if ((!strcmp(line_break_property[prop1].enumname, |
| "TMP_EAW_H") || |
| !strcmp(line_break_property[prop1].enumname, |
| "TMP_EAW_W") || |
| !strcmp(line_break_property[prop1].enumname, |
| "TMP_EAW_F"))) { |
| result = prop2; |
| } else { |
| result = prop1; |
| } |
| } |
| } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") || |
| !strcmp(line_break_property[prop1].enumname, "TMP_MC")) || |
| (!strcmp(line_break_property[prop2].enumname, "TMP_MN") || |
| !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) { |
| if (!strcmp(line_break_property[prop1].enumname, |
| "SA_WITHOUT_MN_OR_MC") || |
| !strcmp(line_break_property[prop2].enumname, |
| "SA_WITHOUT_MN_OR_MC")) { |
| target = "SA_WITH_MN_OR_MC"; |
| } else { |
| /* ignore Mn and Mc for the rest */ |
| if ((!strcmp(line_break_property[prop1].enumname, |
| "TMP_MN") || |
| !strcmp(line_break_property[prop1].enumname, |
| "TMP_MC"))) { |
| result = prop2; |
| } else { |
| result = prop1; |
| } |
| } |
| } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") || |
| !strcmp(line_break_property[prop2].enumname, "TMP_CN")) { |
| if (!strcmp(line_break_property[prop1].enumname, |
| "TMP_EXTENDED_PICTOGRAPHIC") || |
| !strcmp(line_break_property[prop2].enumname, |
| "TMP_EXTENDED_PICTOGRAPHIC")) { |
| target = "BOTH_CN_EXTPICT"; |
| } else { |
| /* ignore Cn for all the other properties */ |
| if (!strcmp(line_break_property[prop1].enumname, |
| "TMP_CN")) { |
| result = prop2; |
| } else { |
| result = prop1; |
| } |
| } |
| } else if (!strcmp(line_break_property[prop1].enumname, |
| "TMP_EXTENDED_PICTOGRAPHIC") || |
| !strcmp(line_break_property[prop2].enumname, |
| "TMP_EXTENDED_PICTOGRAPHIC")) { |
| if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") || |
| !strcmp(line_break_property[prop2].enumname, "TMP_CN")) { |
| target = "BOTH_CN_EXTPICT"; |
| } else { |
| /* ignore Extended_Pictographic for all the other |
| * properties */ |
| if (!strcmp(line_break_property[prop1].enumname, |
| "TMP_EXTENDED_PICTOGRAPHIC")) { |
| result = prop2; |
| } else { |
| result = prop1; |
| } |
| } |
| } else { |
| fprintf(stderr, |
| "handle_conflict: Cannot handle conflict %s <- %s.\n", |
| line_break_property[prop1].enumname, |
| line_break_property[prop2].enumname); |
| exit(1); |
| } |
| |
| if (target) { |
| for (result = 0; result < LEN(line_break_property); result++) { |
| if (!strcmp(line_break_property[result].enumname, |
| target)) { |
| break; |
| } |
| } |
| if (result == LEN(line_break_property)) { |
| fprintf(stderr, "handle_conflict: Internal error.\n"); |
| exit(1); |
| } |
| } |
| |
| return result; |
| } |
| |
| static void |
| post_process(struct properties *prop) |
| { |
| const char *target; |
| uint_least8_t result; |
| size_t i; |
| |
| /* post-mapping according to the line breaking algorithm */ |
| for (i = 0; i < UINT32_C(0x110000); i++) { |
| /* LB1 */ |
| if (!strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_AI") || |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_SG") || |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_XX")) { |
| /* map AI, SG and XX to AL */ |
| target = "AL"; |
| } else if (!strcmp(line_break_property[prop[i].property] |
| .enumname, |
| "TMP_SA_WITH_MN_OR_MC")) { |
| /* map SA (with General_Category Mn or Mc) to CM */ |
| target = "CM"; |
| } else if (!strcmp(line_break_property[prop[i].property] |
| .enumname, |
| "TMP_SA_WITHOUT_MN_OR_MC")) { |
| /* map SA (without General_Category Mn or Mc) to AL */ |
| target = "AL"; |
| } else if (!strcmp(line_break_property[prop[i].property] |
| .enumname, |
| "TMP_CJ")) { |
| /* map CJ to NS */ |
| target = "NS"; |
| } else if ( |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_CN") || |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_EXTENDED_PICTOGRAPHIC") || |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_MN") || |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_MC") || |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_EAW_H") || |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_EAW_W") || |
| !strcmp(line_break_property[prop[i].property].enumname, |
| "TMP_EAW_F")) { |
| /* map all the temporary classes "residue" to AL */ |
| target = "AL"; |
| } else { |
| target = NULL; |
| } |
| |
| if (target) { |
| for (result = 0; result < LEN(line_break_property); |
| result++) { |
| if (!strcmp(line_break_property[result] |
| .enumname, |
| target)) { |
| break; |
| } |
| } |
| if (result == LEN(line_break_property)) { |
| fprintf(stderr, |
| "handle_conflict: Internal error.\n"); |
| exit(1); |
| } |
| |
| prop[i].property = result; |
| } |
| } |
| } |
| |
| int |
| main(int argc, char *argv[]) |
| { |
| (void)argc; |
| |
| properties_generate_break_property( |
| line_break_property, LEN(line_break_property), NULL, |
| handle_conflict, post_process, "line_break", argv[0]); |
| |
| return 0; |
| } |