/*
*******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  props2.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002feb24
*   created by: Markus W. Scherer
*
*   Parse more Unicode Character Database files and store
*   additional Unicode character properties in bit set vectors.
*/

#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
#include "cstring.h"
#include "cmemory.h"
#include "utrie.h"
#include "uprops.h"
#include "propsvec.h"
#include "uparse.h"
#include "genprops.h"

/* FLAG(n): 32-bit mask with only bit n set; n must be in the range 0..31 */
#define FLAG(n) ((uint32_t)1<<(n))

/* data --------------------------------------------------------------------- */

/* trie built from pv in generateAdditionalProperties(), serialized in writeAdditionalData() */
static UNewTrie *trie;
/* properties vectors, opened in initAdditionalProperties(); not static, so it has external linkage */
uint32_t *pv;
/* result of upvec_toTrie(); writeAdditionalData() copies pvCount 32-bit words from pv */
static int32_t pvCount;

/* pending East Asian Width range (start, limit, width value) collected by eaWidthLineFn() */
static uint32_t prevStart=0, prevLimit=0, prevValue=0;

/* prototypes --------------------------------------------------------------- */

/* file drivers: build the file name at basename and parse the file with a line function */

static void
parseTwoFieldFile(char *filename, char *basename,
                  const char *ucdFile, const char *suffix,
                  UParseLineFn *lineFn,
                  UErrorCode *pErrorCode);

static void
parseArabicShaping(char *filename, char *basename,
                   const char *suffix,
                   UErrorCode *pErrorCode);

/* line functions, one per two-field UCD file; each is passed to parseTwoFieldFile() */

static void U_CALLCONV
ageLineFn(void *context,
          char *fields[][2], int32_t fieldCount,
          UErrorCode *pErrorCode);

static void U_CALLCONV
scriptsLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode);

static void U_CALLCONV
blocksLineFn(void *context,
             char *fields[][2], int32_t fieldCount,
             UErrorCode *pErrorCode);

static void U_CALLCONV
propListLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode);

static void U_CALLCONV
derivedPropListLineFn(void *context,
                      char *fields[][2], int32_t fieldCount,
                      UErrorCode *pErrorCode);

static void U_CALLCONV
eaWidthLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode);

static void U_CALLCONV
lineBreakLineFn(void *context,
                char *fields[][2], int32_t fieldCount,
                UErrorCode *pErrorCode);

/* -------------------------------------------------------------------------- */

U_CFUNC void
initAdditionalProperties() {
    pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
}

/*
 * Parse the additional UCD files (DerivedAge, Scripts, Blocks, PropList,
 * DerivedCoreProperties, LineBreak, ArabicShaping, EastAsianWidth),
 * store their values in pv and finally build the trie from pv.
 *
 * filename   - buffer with the path prefix of the UCD files; the name of each
 *              file is written at its end (presumably the buffer must have
 *              room for the longest file name plus suffix - verify in caller)
 * suffix     - passed through to writeUCDFilename() for each file name
 * pErrorCode - ICU error code, must not be NULL; on return it is
 *              U_MEMORY_ALLOCATION_ERROR if the trie could not be opened
 *
 * Failures to store values or to build the trie are fatal: a message is
 * printed to stderr and the tool exits with the error code.
 */
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
    char *basename;

    /* the individual file names are written behind the path prefix */
    basename=filename+uprv_strlen(filename);

    /* process various UCD .txt files */
    parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);

    /*
     * UTR 24 says:
     * Section 2:
     *   "Common - For characters that may be used
     *             within multiple scripts,
     *             or any unassigned code points."
     *
     * Section 4:
     *   "The value COMMON is the default value,
     *    given to all code points that are not
     *    explicitly mentioned in the data file."
     */
    if(!upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)USCRIPT_COMMON, UPROPS_SCRIPT_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set script code: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
    parseTwoFieldFile(filename, basename, "Scripts", suffix, scriptsLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "Blocks", suffix, blocksLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "PropList", suffix, propListLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "DerivedCoreProperties", suffix, derivedPropListLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "LineBreak", suffix, lineBreakLineFn, pErrorCode);

    parseArabicShaping(filename, basename, suffix, pErrorCode);

    /*
     * Preset East Asian Width defaults:
     * N for all
     * A for Private Use
     * W for plane 2
     *
     * NOTE(review): the error code is reset here, which discards any parse
     * error reported (on stderr only) by the files above - confirm that
     * continuing after such an error is intended.
     */
    *pErrorCode=U_ZERO_ERROR;
    if( !upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)(U_EA_NEUTRAL<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
        !upvec_setValue(pv, 0xe000, 0xf900, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
        !upvec_setValue(pv, 0xf0000, 0xffffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
        !upvec_setValue(pv, 0x100000, 0x10fffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
        !upvec_setValue(pv, 0x20000, 0x2fffe, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)
    ) {
        fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
    /* start with an empty pending range for eaWidthLineFn() */
    prevStart=prevLimit=prevValue=0;
    /* parse EastAsianWidth.txt */
    parseTwoFieldFile(filename, basename, "EastAsianWidth", suffix, eaWidthLineFn, pErrorCode);
    /* set last range: eaWidthLineFn() only stores a range when the next one starts */
    if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set East Asian Width: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

    trie=utrie_open(NULL, NULL, 50000, 0, FALSE);
    if(trie==NULL) {
        fprintf(stderr, "genprops error: unable to allocate trie for additional properties\n");
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        upvec_close(pv);
        pv=NULL; /* do not leave a dangling pointer in the global */
        return;
    }

    pvCount=upvec_toTrie(pv, trie, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/*
 * Parse one semicolon-delimited UCD file with two fields per line.
 * The file name is built at basename from ucdFile and suffix, then every
 * line is handed to lineFn. Does nothing if pErrorCode is NULL or already
 * indicates a failure. A parse failure is reported on stderr and left in
 * *pErrorCode for the caller.
 */
static void
parseTwoFieldFile(char *filename, char *basename,
                  const char *ucdFile, const char *suffix,
                  UParseLineFn *lineFn,
                  UErrorCode *pErrorCode) {
    char *lineFields[2][2];

    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {
        writeUCDFilename(basename, ucdFile, suffix);

        u_parseDelimitedFile(filename, ';', lineFields, 2, lineFn, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
        }
    }
}

/* DerivedAge.txt ----------------------------------------------------------- */

/*
 * Line function for DerivedAge.txt.
 * Field 0 is a code point or range, field 1 a version number
 * "major" or "major.minor" with major in 1..15 and minor in 0..15.
 * The version is stored as (major<<4)|minor in the age bits of vector word 0.
 * Any syntax error or storage failure prints a message and exits the tool.
 * context and fieldCount are not used.
 */
static void U_CALLCONV
ageLineFn(void *context,
          char *fields[][2], int32_t fieldCount,
          UErrorCode *pErrorCode) {
    uint32_t first, limit, major, minor, age;
    char *text, *stop;

    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* one past the parsed range end, as passed to upvec_setValue() */
    ++limit;

    /* major version number: 1..15, followed by '.', white space or the end of the field */
    text=(char *)u_skipWhitespace(fields[1][0]);
    major=(uint32_t)uprv_strtoul(text, &stop, 10);
    if( text==stop || major==0 || major>15 ||
        !(*stop=='.' || *stop==' ' || *stop=='\t' || *stop==0)
    ) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* optional minor version number: 0..15, followed by white space or the end of the field */
    minor=0;
    if(*stop=='.') {
        text=(char *)u_skipWhitespace(stop+1);
        minor=(uint32_t)uprv_strtoul(text, &stop, 10);
        if( text==stop || minor>15 ||
            !(*stop==' ' || *stop=='\t' || *stop==0)
        ) {
            fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    /* major version in the upper nibble, minor version in the lower one */
    age=(major<<4)|minor;

    if(!upvec_setValue(pv, first, limit, 0, age<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/* Scripts.txt -------------------------------------------------------------- */

/*
 * Line function for Scripts.txt.
 * Field 0 is a code point or range, field 1 a script name.
 * The name is trimmed in place, looked up with uscript_getCode() and the
 * resulting script code is stored in the script bits of vector word 0.
 * An unknown name or a storage failure prints a message and exits the tool.
 * context and fieldCount are not used.
 */
static void U_CALLCONV
scriptsLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    char *name, *nameLimit;
    uint32_t first, limit;
    UScriptCode code;

    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in Scripts.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* one past the parsed range end, as passed to upvec_setValue() */
    ++limit;

    /* script name: skip leading white space ... */
    name=(char *)u_skipWhitespace(fields[1][0]);

    /* ... and cut off trailing spaces and tabs by terminating the name in the line buffer */
    for(nameLimit=fields[1][1];
        name<nameLimit && (nameLimit[-1]==' ' || nameLimit[-1]=='\t');
        --nameLimit
    ) {
    }
    *nameLimit=0;

    /* exactly one valid script code must come back */
    if( uscript_getCode(name, &code, 1, pErrorCode)!=1 ||
        U_FAILURE(*pErrorCode) ||
        code<=USCRIPT_INVALID_CODE
    ) {
        fprintf(stderr, "genprops error: unknown script name in Scripts.txt field 1 at %s\n", fields[1][0]);
        if(U_SUCCESS(*pErrorCode)) {
            *pErrorCode=U_PARSE_ERROR;
        }
        exit(*pErrorCode);
    }

    if(!upvec_setValue(pv, first, limit, 0, (uint32_t)code, UPROPS_SCRIPT_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set script code: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/* Blocks.txt --------------------------------------------------------------- */

/*
 * Blocks.txt block names in the order of the parallel UBlockCode constants.
 * blocksLineFn() stores the index of a name in this array as the block code.
 * Index 0 has no name (NULL).
 */
static const char *const
blockNames[UBLOCK_COUNT]={
    NULL,                                       /* 0 */
    "Basic Latin",
    "Latin-1 Supplement",
    "Latin Extended-A",
    "Latin Extended-B",
    "IPA Extensions",
    "Spacing Modifier Letters",
    "Combining Diacritical Marks",
    "Greek and Coptic",                         /* used to be just "Greek" before Unicode 3.2 */
    "Cyrillic",
    "Armenian",                                 /* 10 */
    "Hebrew",
    "Arabic",
    "Syriac",
    "Thaana",
    "Devanagari",
    "Bengali",
    "Gurmukhi",
    "Gujarati",
    "Oriya",
    "Tamil",                                    /* 20 */
    "Telugu",
    "Kannada",
    "Malayalam",
    "Sinhala",
    "Thai",
    "Lao",
    "Tibetan",
    "Myanmar",
    "Georgian",
    "Hangul Jamo",                              /* 30 */
    "Ethiopic",
    "Cherokee",
    "Unified Canadian Aboriginal Syllabics",
    "Ogham",
    "Runic",
    "Khmer",
    "Mongolian",
    "Latin Extended Additional",
    "Greek Extended",
    "General Punctuation",                      /* 40 */
    "Superscripts and Subscripts",
    "Currency Symbols",
    "Combining Diacritical Marks for Symbols",  /* used to be "Combining Marks for Symbols" before Unicode 3.2 */
    "Letterlike Symbols",
    "Number Forms",
    "Arrows",
    "Mathematical Operators",
    "Miscellaneous Technical",
    "Control Pictures",
    "Optical Character Recognition",            /* 50 */
    "Enclosed Alphanumerics",
    "Box Drawing",
    "Block Elements",
    "Geometric Shapes",
    "Miscellaneous Symbols",
    "Dingbats",
    "Braille Patterns",
    "CJK Radicals Supplement",
    "Kangxi Radicals",
    "Ideographic Description Characters",       /* 60 */
    "CJK Symbols and Punctuation",
    "Hiragana",
    "Katakana",
    "Bopomofo",
    "Hangul Compatibility Jamo",
    "Kanbun",
    "Bopomofo Extended",
    "Enclosed CJK Letters and Months",
    "CJK Compatibility",
    "CJK Unified Ideographs Extension A",       /* 70 */
    "CJK Unified Ideographs",
    "Yi Syllables",
    "Yi Radicals",
    "Hangul Syllables",
    "High Surrogates",
    "High Private Use Surrogates",
    "Low Surrogates",
    "Private Use Area",                         /* used to be "Private Use" before Unicode 3.2 */
    "CJK Compatibility Ideographs",
    "Alphabetic Presentation Forms",            /* 80 */
    "Arabic Presentation Forms-A",
    "Combining Half Marks",
    "CJK Compatibility Forms",
    "Small Form Variants",
    "Arabic Presentation Forms-B",
    "Specials",
    "Halfwidth and Fullwidth Forms",
    "Old Italic",
    "Gothic",
    "Deseret",                                  /* 90 */
    "Byzantine Musical Symbols",
    "Musical Symbols",
    "Mathematical Alphanumeric Symbols",
    "CJK Unified Ideographs Extension B",
    "CJK Compatibility Ideographs Supplement",
    "Tags",
    "Cyrillic Supplementary",                   /* first new block in Unicode 3.2 */
    "Tagalog",
    "Hanunoo",
    "Buhid",                                    /* 100 */
    "Tagbanwa",
    "Miscellaneous Mathematical Symbols-A",
    "Supplemental Arrows-A",
    "Supplemental Arrows-B",
    "Miscellaneous Mathematical Symbols-B",
    "Supplemental Mathematical Operators",
    "Katakana Phonetic Extensions",
    "Variation Selectors",
    "Supplementary Private Use Area-A",
    "Supplementary Private Use Area-B"          /* 110 */
};

/*
 * Line function for Blocks.txt.
 * Field 0 is a code point range, field 1 a block name.
 * The name is looked up in blockNames[]; three names that were changed in
 * Unicode 3.2 are also accepted in their older form. The block code is
 * stored in the block bits of vector word 0.
 * An unknown name or a storage failure prints a message and exits the tool.
 * context and fieldCount are not used.
 */
static void U_CALLCONV
blocksLineFn(void *context,
             char *fields[][2], int32_t fieldCount,
             UErrorCode *pErrorCode) {
    /* block names from before Unicode 3.2, tried in this order */
    static const struct {
        const char *name;
        int32_t block;
    } oldNames[]={
        { "Greek", UBLOCK_GREEK },                                              /* now "Greek and Coptic" */
        { "Combining Marks for Symbols", UBLOCK_COMBINING_MARKS_FOR_SYMBOLS }, /* now "Combining Diacritical Marks for Symbols" */
        { "Private Use", UBLOCK_PRIVATE_USE }                                   /* now "Private Use Area" */
    };

    uint32_t first, limit;
    int32_t block, j;

    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in Blocks.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* one past the parsed range end, as passed to upvec_setValue() */
    ++limit;

    /* current block names first, then the older spellings */
    block=getTokenIndex(blockNames, UBLOCK_COUNT, fields[1][0]);
    if(block<0) {
        for(j=0; j<(int32_t)(sizeof(oldNames)/sizeof(oldNames[0])); ++j) {
            if(isToken(oldNames[j].name, fields[1][0])) {
                block=oldNames[j].block;
                break;
            }
        }
        if(block<0) {
            fprintf(stderr, "genprops error: unknown block name \"%s\" in Blocks.txt\n", fields[1][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    if(!upvec_setValue(pv, first, limit, 0, (uint32_t)block<<UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set block code: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/* PropList.txt ------------------------------------------------------------- */

/*
 * Binary property names from PropList.txt.
 * propListLineFn() uses the index of a name in this array as the number
 * of the bit that it sets in vector word 1.
 *
 * Keep this list of property names in sync with the
 * enums in icu/source/common/uprops.h, see UPROPS_BINARY_1_TOP!
 *
 * Careful: The UPROPS_ enum constants also contain the
 * derivedPropListNames[] entries. If new properties are added to
 * PropList.txt, then those entries would need to be skipped here
 * with NULL entries.
 */
static const char *const
propListNames[]={
    "White_Space",
    "Bidi_Control",
    "Join_Control",
    "Dash",
    "Hyphen",
    "Quotation_Mark",
    "Terminal_Punctuation",
    "Other_Math",
    "Hex_Digit",
    "ASCII_Hex_Digit",
    "Other_Alphabetic",
    "Ideographic",
    "Diacritic",
    "Extender",
    "Other_Lowercase",
    "Other_Uppercase",
    "Noncharacter_Code_Point",
    "Other_Grapheme_Extend",
    "Grapheme_Link",
    "IDS_Binary_Operator",
    "IDS_Trinary_Operator",
    "Radical",
    "Unified_Ideograph",
    "Other_Default_Ignorable_Code_Point",
    "Deprecated",
    "Soft_Dotted",
    "Logical_Order_Exception"
};

/*
 * Line function for PropList.txt.
 * Field 0 is a code point or range, field 1 a binary property name.
 * The index of the name in propListNames[] selects the bit that is set
 * in vector word 1 for the range.
 * An unknown name only produces a warning and the line is skipped;
 * a syntax error in field 0 or a storage failure exits the tool.
 * context and fieldCount are not used.
 */
static void U_CALLCONV
propListLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    uint32_t first, limit, bit;
    int32_t index;

    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in PropList.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* one past the parsed range end, as passed to upvec_setValue() */
    ++limit;

    /* look up the binary property name */
    index=getTokenIndex(propListNames, sizeof(propListNames)/sizeof(*propListNames), fields[1][0]);
    if(index<0) {
        if(!isToken("White_space", fields[1][0])) {
            fprintf(stderr, "genprops warning: unknown binary property name \"%s\" in PropList.txt\n", fields[1][0]);
            return;
        }
        /* accept misspelled property name in Unicode 3.1.1 */
        index=0;
    }

    bit=FLAG(index);
    if(!upvec_setValue(pv, first, limit, 1, bit, bit, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set binary property: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/* DerivedCoreProperties ---------------------------------------------------- */

/*
 * Names of the DerivedCoreProperties.txt properties that are stored.
 * derivedPropListLineFn() sets bit UPROPS_XID_START+i for the name at index i
 * and ignores all other property names in the file.
 */
static const char *const
derivedPropListNames[]={
    "XID_Start",
    "XID_Continue"
};

/*
 * Line function for DerivedCoreProperties.txt.
 * Field 0 is a code point or range, field 1 a derived binary property name.
 * Only the names in derivedPropListNames[] are stored, as bit
 * UPROPS_XID_START+index in vector word 1; lines with other names are
 * silently ignored.
 * A syntax error in field 0 or a storage failure exits the tool.
 * context and fieldCount are not used.
 */
static void U_CALLCONV
derivedPropListLineFn(void *context,
                      char *fields[][2], int32_t fieldCount,
                      UErrorCode *pErrorCode) {
    uint32_t first, limit, bit;
    int32_t index;

    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in DerivedCoreProperties.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* one past the parsed range end, as passed to upvec_setValue() */
    ++limit;

    /* this file contains many properties; only the listed ones are of interest */
    index=getTokenIndex(derivedPropListNames, sizeof(derivedPropListNames)/sizeof(*derivedPropListNames), fields[1][0]);
    if(index<0) {
        return;
    }

    bit=FLAG(UPROPS_XID_START+index);
    if(!upvec_setValue(pv, first, limit, 1, bit, bit, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set derived binary property: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/* East Asian Width --------------------------------------------------------- */

/*
 * East Asian Width value names as used in EastAsianWidth.txt.
 * eaWidthLineFn() stores the index of a name in this array as the width value.
 * Keep this list in sync with UEAWidthCode in uprops.h or uchar.h.
 */
static const char *const
eaNames[U_EA_COUNT]={
    "N",        /* Non-East Asian Neutral, default for unassigned code points */
    "A",        /* Ambiguous, default for Private Use code points */
    "H",        /* Half-width */
    "F",        /* Full-width */
    "Na",       /* Narrow */
    "W"         /* Wide, default for plane 2 */
};

/*
 * Line function for EastAsianWidth.txt.
 * Field 0 is a code point or range, field 1 a width name from eaNames[].
 *
 * Adjacent ranges with the same width are merged: the pending range is kept
 * in prevStart/prevLimit/prevValue and only written to pv when a line does
 * not continue it. The caller has to reset these variables before parsing
 * and to store the last pending range after parsing.
 * An unknown name or a storage failure prints a message and exits the tool.
 * context and fieldCount are not used.
 */
static void U_CALLCONV
eaWidthLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t first, limit;
    int32_t width;

    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in EastAsianWidth.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* one past the parsed range end, as passed to upvec_setValue() */
    ++limit;

    /* look up the width name */
    width=getTokenIndex(eaNames, U_EA_COUNT, fields[1][0]);
    if(width<0) {
        fprintf(stderr, "genprops error: unknown width name \"%s\" in EastAsianWidth.txt\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* same width directly behind the pending range: just extend it */
    if(first==prevLimit && prevValue==(uint32_t)width) {
        prevLimit=limit;
        return;
    }

    /* otherwise store the pending range and start a new one with this line */
    if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set East Asian Width: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
    prevValue=(uint32_t)width;
    prevStart=first;
    prevLimit=limit;
}

/* LineBreak.txt ------------------------------------------------------------ */

/*
 * LineBreak.txt line break value names in the order of the parallel ULineBreak constants.
 * lineBreakLineFn() stores the index of a name in this array as the line break code.
 */
static const char *const
lbNames[U_LB_COUNT]={
    "XX",
    "AI",
    "AL",
    "B2",
    "BA",
    "BB",
    "BK",
    "CB",
    "CL",
    "CM",
    "CR",
    "EX",
    "GL",
    "HY",
    "ID",
    "IN",
    "IS",
    "LF",
    "NS",
    "NU",
    "OP",
    "PO",
    "PR",
    "QU",
    "SA",
    "SG",
    "SP",
    "SY",
    "ZW"
};

/*
 * Line function for LineBreak.txt.
 * Field 0 is a code point or range, field 1 a line break value name.
 * The index of the name in lbNames[] is stored in the line break bits
 * of vector word 0.
 * An unknown name or a storage failure prints a message and exits the tool.
 * context and fieldCount are not used.
 */
static void U_CALLCONV
lineBreakLineFn(void *context,
                char *fields[][2], int32_t fieldCount,
                UErrorCode *pErrorCode) {
    uint32_t first, limit, bits;
    int32_t lb;

    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in LineBreak.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* one past the parsed range end, as passed to upvec_setValue() */
    ++limit;

    /* look up the line break value name */
    lb=getTokenIndex(lbNames, U_LB_COUNT, fields[1][0]);
    if(lb<0) {
        fprintf(stderr, "genprops error: unknown line break name \"%s\" in LineBreak.txt\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    bits=(uint32_t)lb<<UPROPS_LB_SHIFT;
    if(!upvec_setValue(pv, first, limit, 0, bits, UPROPS_LB_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set line break code: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/* ArabicShaping.txt -------------------------------------------------------- */

/* Joining Type/Joining Group names in the order of the parallel UJoiningType/UJoiningGroup constants */

/* Joining Type names; arabicShapingLineFn() stores the index of a name as the joining type */
static const char *const
jtNames[U_JT_COUNT]={
    "U",
    "C",
    "D",
    "L",
    "R",
    "T"
};

/* Joining Group names; arabicShapingLineFn() stores the index of a name as the joining group */
static const char *const
jgNames[U_JG_COUNT]={
    "<no shaping>",
    "AIN",
    "ALAPH",
    "ALEF",
    "BEH",
    "BETH",
    "DAL",
    "DALATH RISH",
    "E",
    "FEH",
    "FINAL SEMKATH",
    "GAF",
    "GAMAL",
    "HAH",
    "HAMZA ON HEH GOAL",
    "HE",
    "HEH",
    "HEH GOAL",
    "HETH",
    "KAF",
    "KAPH",
    "KNOTTED HEH",
    "LAM",
    "LAMADH",
    "MEEM",
    "MIM",
    "NOON",
    "NUN",
    "PE",
    "QAF",
    "QAPH",
    "REH",
    "REVERSED PE",
    "SAD",
    "SADHE",
    "SEEN",
    "SEMKATH",
    "SHIN",
    "SWASH KAF",
    "SYRIAC WAW",
    "TAH",
    "TAW",
    "TEH MARBUTA",
    "TETH",
    "WAW",
    "YEH",
    "YEH BARREE",
    "YEH WITH TAIL",
    "YUDH",
    "YUDH HE",
    "ZAIN"
};

/*
 * Line function for ArabicShaping.txt (four fields per line).
 * Field 0 is a code point or range, field 2 a joining type name from
 * jtNames[] and field 3 a joining group name from jgNames[]; field 1 is
 * not looked at. Both indexes are stored together in vector word 2.
 * An unknown name or a storage failure prints a message and exits the tool.
 * context and fieldCount are not used.
 */
static void U_CALLCONV
arabicShapingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    uint32_t first, limit, bits;
    int32_t type, group;

    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in ArabicShaping.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* one past the parsed range end, as passed to upvec_setValue() */
    ++limit;

    /* joining type from field 2 */
    type=getTokenIndex(jtNames, U_JT_COUNT, fields[2][0]);
    if(type<0) {
        fprintf(stderr, "genprops error: unknown joining type in \"%s\" in ArabicShaping.txt\n", fields[2][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* joining group from field 3 */
    group=getTokenIndex(jgNames, U_JG_COUNT, fields[3][0]);
    if(group<0) {
        fprintf(stderr, "genprops error: unknown joining group in \"%s\" in ArabicShaping.txt\n", fields[3][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* both values share vector word 2 and are set with one call */
    bits=((uint32_t)type<<UPROPS_JT_SHIFT)|((uint32_t)group<<UPROPS_JG_SHIFT);
    if(!upvec_setValue(pv, first, limit, 2, bits, UPROPS_JT_MASK|UPROPS_JG_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set joining type/group code: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/*
 * Parse ArabicShaping.txt, a semicolon-delimited file with four fields per
 * line, with arabicShapingLineFn(). The file name is built at basename.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * A parse failure is reported on stderr and left in *pErrorCode.
 */
static void
parseArabicShaping(char *filename, char *basename,
                   const char *suffix,
                   UErrorCode *pErrorCode) {
    char *lineFields[4][2];

    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {
        writeUCDFilename(basename, "ArabicShaping", suffix);

        u_parseDelimitedFile(filename, ';', lineFields, 4, arabicShapingLineFn, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "error parsing ArabicShaping.txt: %s\n", u_errorName(*pErrorCode));
        }
    }
}

/* data serialization ------------------------------------------------------- */

/*
 * Serialize the additional properties: first the trie, then the
 * properties vectors.
 *
 * p        - destination buffer; if NULL, no indexes are set and nothing is
 *            copied or closed
 * capacity - number of bytes available at p
 * indexes  - index array of the data file; if p!=NULL, the entries for the
 *            vectors offset, the vector column count, the reserved offset
 *            and the maximum values are set. The vectors offset is computed
 *            from indexes[UPROPS_ADDITIONAL_TRIE_INDEX], which the caller
 *            has to have set already.
 *
 * Returns the length in bytes of the serialized trie plus the vectors.
 * The vectors are only copied if they fit into the remaining capacity;
 * NOTE(review): the caller presumably compares the result with the capacity
 * to detect that case - confirm.
 * If p!=NULL, the trie and the vectors are closed before returning, so the
 * data can be written only once.
 * A serialization failure prints a message and exits the tool.
 */
U_CFUNC int32_t
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
    int32_t length;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
        exit(errorCode);
    }
    if(p!=NULL) {
        /* the vectors follow directly behind the trie */
        p+=length;
        capacity-=length;
        if(beVerbose) {
            /* cast so that the argument type matches the conversion specifier on every platform */
            printf("size in bytes of additional props trie:%5lu\n", (unsigned long)length);
        }

        /* set indexes; offsets are counted in 32-bit units, hence length/4 */
        indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
            indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
        indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
        indexes[UPROPS_RESERVED_INDEX]=
            indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;

        indexes[UPROPS_MAX_VALUES_INDEX]=
            (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
            ((int32_t)USCRIPT_CODE_LIMIT-1);
    }

    if(p!=NULL && (pvCount*4)<=capacity) {
        uprv_memcpy(p, pv, pvCount*4);
        if(beVerbose) {
            printf("number of additional props vectors:    %5lu\n", (unsigned long)(pvCount/UPROPS_VECTOR_WORDS));
            printf("number of 32-bit words per vector:     %5lu\n", (unsigned long)UPROPS_VECTOR_WORDS);
        }
    }
    length+=pvCount*4;

    if(p!=NULL) {
        /* release the build-time structures and do not leave dangling pointers in the globals */
        utrie_close(trie);
        trie=NULL;
        upvec_close(pv);
        pv=NULL;
    }
    return length;
}
