// unicode/c/genprops/props2.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 *******************************************************************************
 *
 *   Copyright (C) 2002-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  props2.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2002feb24
 *   created by: Markus W. Scherer
 *
 *   Parse more Unicode Character Database files and store
 *   additional Unicode character properties in bit set vectors.
 */

 #include <stdio.h>
 #include "unicode/utypes.h"
 #include "unicode/uchar.h"
 #include "unicode/unistr.h"
 #include "unicode/uscript.h"
 #include "cstring.h"
 #include "cmemory.h"
 #include "utrie2.h"
 #include "uprops.h"
 #include "propsvec.h"
 #include "uparse.h"
 #include "writesrc.h"
 #include "genprops.h"

 /* Element count of a true array (not a pointer), converted to int32_t. */
 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

 U_NAMESPACE_USE

 /* data --------------------------------------------------------------------- */

 /* Trie built from pv in generateAdditionalProperties(); closed in exitAdditionalProperties(). */
 static UTrie2 *newTrie;
 /* Properties vectors, opened in initAdditionalProperties(). Not static, so it has external linkage. */
 UPropsVectors *pv;

 /* Script_Extensions data: 16-bit units appended by scriptExtensionsLineFn(). */
 static UnicodeString *scriptExtensions;

 /* miscellaneous ------------------------------------------------------------ */

 /*
  * Trim a field in place.
  * Skips leading whitespace from s, backs up from limit over trailing
  * whitespace (never going before the trimmed start), writes a NUL at the
  * resulting end, and returns the trimmed start.
  * limit must point to a writable char because it may receive the NUL.
  */
 static char *
 trimTerminateField(char *s, char *limit) {
     char *fieldStart=(char *)u_skipWhitespace(s);
     char *fieldEnd=limit;

     /* walk backwards while the char just before fieldEnd is whitespace */
     for(; fieldStart<fieldEnd; --fieldEnd) {
         if(!U_IS_INV_WHITESPACE(fieldEnd[-1])) {
             break;
         }
     }
     *fieldEnd=0;

     return fieldStart;
 }

 static void
 parseTwoFieldFile(char *filename, char *basename,
                   const char *ucdFile, const char *suffix,
                   UParseLineFn *lineFn,
                   UErrorCode *pErrorCode) {
     char *fields[2][2];

     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
         return;
     }

     writeUCDFilename(basename, ucdFile, suffix);

     u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
     }
 }

 /* Line callback for DerivedAge.txt; defined further down in this file. */
 static void U_CALLCONV
 ageLineFn(void *context,
           char *fields[][2], int32_t fieldCount,
           UErrorCode *pErrorCode);

 /* Line callback for ScriptExtensions.txt; defined further down in this file. */
 static void U_CALLCONV
 scriptExtensionsLineFn(void *context,
                        char *fields[][2], int32_t fieldCount,
                        UErrorCode *pErrorCode);

 static void
 parseMultiFieldFile(char *filename, char *basename,
                     const char *ucdFile, const char *suffix,
                     int32_t fieldCount,
                     UParseLineFn *lineFn,
                     UErrorCode *pErrorCode) {
     char *fields[20][2];

     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
         return;
     }

     writeUCDFilename(basename, ucdFile, suffix);

     u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
     }
 }

 /* Line callback for DerivedNumericValues.txt; defined further down in this file. */
 static void U_CALLCONV
 numericLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode);

 /* parse files with single enumerated properties ---------------------------- */

 /*
  * Describes one UCD file that assigns a single enumerated property:
  * - ucdFile: file name without the ".txt" extension (used for parsing and in messages)
  * - propName: property name used in error messages
  * - prop: property selector passed to u_getPropertyValueEnum()
  * - vecWord: column of the properties vector that stores the value
  * - vecShift: left shift applied to the value before it is stored
  * - vecMask: bit mask of the value's field inside the vector word
  */
 struct SingleEnum {
     const char *ucdFile, *propName;
     UProperty prop;
     int32_t vecWord, vecShift;
     uint32_t vecMask;
 };
 typedef struct SingleEnum SingleEnum;

 /* Parses the UCD file described by sen; defined after singleEnumLineFn() below. */
 static void
 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
                     const SingleEnum *sen,
                     UErrorCode *pErrorCode);

 /*
  * Descriptors for the single-enumerated-property files.
  * Each initializer lists: UCD file name, property name for messages,
  * UProperty selector, then vector word, shift and mask.
  */

 /* Scripts.txt: stored in vector word 0 without a shift. */
 static const SingleEnum scriptSingleEnum={
     "Scripts", "script",
     UCHAR_SCRIPT,
     0, 0, UPROPS_SCRIPT_MASK
 };

 /* Blocks.txt: stored in vector word 0. */
 static const SingleEnum blockSingleEnum={
     "Blocks", "block",
     UCHAR_BLOCK,
     0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
 };

 /* GraphemeBreakProperty.txt: stored in vector word 2. */
 static const SingleEnum graphemeClusterBreakSingleEnum={
     "GraphemeBreakProperty", "Grapheme_Cluster_Break",
     UCHAR_GRAPHEME_CLUSTER_BREAK,
     2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
 };

 /* WordBreakProperty.txt: stored in vector word 2. */
 static const SingleEnum wordBreakSingleEnum={
     "WordBreakProperty", "Word_Break",
     UCHAR_WORD_BREAK,
     2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
 };

 /* SentenceBreakProperty.txt: stored in vector word 2. */
 static const SingleEnum sentenceBreakSingleEnum={
     "SentenceBreakProperty", "Sentence_Break",
     UCHAR_SENTENCE_BREAK,
     2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
 };

 /* LineBreak.txt: stored in vector word 2. */
 static const SingleEnum lineBreakSingleEnum={
     "LineBreak", "line break",
     UCHAR_LINE_BREAK,
     2, UPROPS_LB_SHIFT, UPROPS_LB_MASK
 };

 /* EastAsianWidth.txt: stored in vector word 0. */
 static const SingleEnum eawSingleEnum={
     "EastAsianWidth", "east asian width",
     UCHAR_EAST_ASIAN_WIDTH,
     0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
 };

 /*
  * Line callback for files with one enumerated property per code point range.
  *
  * context points to the SingleEnum descriptor for the file.
  * fields[0] holds the code point or range, fields[1] the property value alias.
  * The alias is converted with u_getPropertyValueEnum(); for the Block property,
  * three pre-Unicode 3.2 block names are also accepted.
  * The value is shifted into position and written into the properties vector
  * for the whole range. Any error is reported on stderr and terminates the tool.
  */
 static void U_CALLCONV
 singleEnumLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
     const SingleEnum *sen;
     char *s;
     uint32_t start, end, uv;
     int32_t value;

     sen=(const SingleEnum *)context;

     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
         exit(*pErrorCode);
     }

     /* parse property alias */
     s=trimTerminateField(fields[1][0], fields[1][1]);
     value=u_getPropertyValueEnum(sen->prop, s);
     if(value<0) {
         if(sen->prop==UCHAR_BLOCK) {
             if(isToken("Greek", s)) {
                 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
             } else if(isToken("Combining Marks for Symbols", s)) {
                 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
             } else if(isToken("Private Use", s)) {
                 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
             }
         }
     }
     if(value<0) {
         fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
                         sen->propName, sen->ucdFile, s);
         exit(U_PARSE_ERROR);
     }

     /*
      * Shift in the unsigned domain (value is known to be non-negative here)
      * so that a large value cannot overflow a signed int.
      * The round-trip comparison also catches bits shifted out of the 32-bit word,
      * which the mask test alone would not see.
      */
     uv=(uint32_t)value<<sen->vecShift;
     if((uv&sen->vecMask)!=uv || (uv>>sen->vecShift)!=(uint32_t)value) {
         fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
                         sen->propName, (unsigned int)uv, s);
         exit(U_INTERNAL_PROGRAM_ERROR);
     }

     if(start==0 && end==0x10ffff) {
         /* Also set bits for initialValue and errorValue. */
         end=UPVEC_MAX_CP;
     }
     upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops error: unable to set %s code: %s\n",
                         sen->propName, u_errorName(*pErrorCode));
         exit(*pErrorCode);
     }
 }

 static void
 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
                     const SingleEnum *sen,
                     UErrorCode *pErrorCode) {
     char *fields[2][2];

     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
         return;
     }

     writeUCDFilename(basename, sen->ucdFile, suffix);

     u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
     }
 }

 /* parse files with multiple binary properties ------------------------------ */

 /*
  * One binary property:
  * - propName: property name as matched against field 1 of the UCD file
  * - vecWord: column of the properties vector that holds the property's bit
  * - vecShift: bit number inside that word (binariesLineFn() requires it to be <32)
  */
 struct Binary {
     const char *propName;
     int32_t vecWord, vecShift;
 };
 typedef struct Binary Binary;

 /*
  * One UCD file that lists several binary properties:
  * - ucdFile: file name without the ".txt" extension
  * - binaries, binariesCount: the recognized properties; others are ignored
  */
 struct Binaries {
     const char *ucdFile;
     const Binary *binaries;
     int32_t binariesCount;
 };
 typedef struct Binaries Binaries;

 /* Binary properties read from PropList.txt; each one is a single bit in vector word 1. */
 static const Binary
 propListNames[]={
     { "White_Space",                        1, UPROPS_WHITE_SPACE },
     { "Dash",                               1, UPROPS_DASH },
     { "Hyphen",                             1, UPROPS_HYPHEN },
     { "Quotation_Mark",                     1, UPROPS_QUOTATION_MARK },
     { "Terminal_Punctuation",               1, UPROPS_TERMINAL_PUNCTUATION },
     { "Hex_Digit",                          1, UPROPS_HEX_DIGIT },
     { "ASCII_Hex_Digit",                    1, UPROPS_ASCII_HEX_DIGIT },
     { "Ideographic",                        1, UPROPS_IDEOGRAPHIC },
     { "Diacritic",                          1, UPROPS_DIACRITIC },
     { "Extender",                           1, UPROPS_EXTENDER },
     { "Noncharacter_Code_Point",            1, UPROPS_NONCHARACTER_CODE_POINT },
     { "Grapheme_Link",                      1, UPROPS_GRAPHEME_LINK },
     { "IDS_Binary_Operator",                1, UPROPS_IDS_BINARY_OPERATOR },
     { "IDS_Trinary_Operator",               1, UPROPS_IDS_TRINARY_OPERATOR },
     { "Radical",                            1, UPROPS_RADICAL },
     { "Unified_Ideograph",                  1, UPROPS_UNIFIED_IDEOGRAPH },
     { "Deprecated",                         1, UPROPS_DEPRECATED },
     { "Logical_Order_Exception",            1, UPROPS_LOGICAL_ORDER_EXCEPTION },

     /* new properties in Unicode 4.0.1 */
     { "STerm",                              1, UPROPS_S_TERM },
     { "Variation_Selector",                 1, UPROPS_VARIATION_SELECTOR },

     /* new properties in Unicode 4.1 */
     { "Pattern_Syntax",                     1, UPROPS_PATTERN_SYNTAX },
     { "Pattern_White_Space",                1, UPROPS_PATTERN_WHITE_SPACE }
 };

 /* Descriptor that ties the PropList.txt file name to its property table. */
 static const Binaries
 propListBinaries={
     "PropList", propListNames, LENGTHOF(propListNames)
 };

 /* Binary properties read from DerivedCoreProperties.txt; each one is a single bit in vector word 1. */
 static const Binary
 derCorePropsNames[]={
     { "XID_Start",                          1, UPROPS_XID_START },
     { "XID_Continue",                       1, UPROPS_XID_CONTINUE },

     /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
     { "Math",                               1, UPROPS_MATH },
     { "Alphabetic",                         1, UPROPS_ALPHABETIC },
     { "Grapheme_Extend",                    1, UPROPS_GRAPHEME_EXTEND },
     { "Default_Ignorable_Code_Point",       1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },

     /* new properties bits in ICU 2.6/format version 3.2 */
     { "ID_Start",                           1, UPROPS_ID_START },
     { "ID_Continue",                        1, UPROPS_ID_CONTINUE },
     { "Grapheme_Base",                      1, UPROPS_GRAPHEME_BASE },

     /*
      * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt
      * to DerivedCoreProperties.txt and deprecates it.
      */
     { "Grapheme_Link",                      1, UPROPS_GRAPHEME_LINK }
 };

 /* Descriptor that ties the DerivedCoreProperties.txt file name to its property table. */
 static const Binaries
 derCorePropsBinaries={
     "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
 };

 static char ignoredProps[100][64];
 static int32_t ignoredPropsCount;

 static void
 addIgnoredProp(char *s, char *limit) {
     int32_t i;

     s=trimTerminateField(s, limit);
     for(i=0; i<ignoredPropsCount; ++i) {
         if(0==uprv_strcmp(ignoredProps[i], s)) {
             return;
         }
     }
     uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
 }

 /*
  * Line callback for files that list several binary properties.
  *
  * context points to the Binaries descriptor for the file.
  * fields[0] holds the code point or range, fields[1] the property name.
  * If the name is in the descriptor's table, the property's bit is set in the
  * properties vector for the whole range. Unknown names are skipped, and
  * recorded via addIgnoredProp() when beVerbose is set.
  * Any error is reported on stderr and terminates the tool.
  */
 static void U_CALLCONV
 binariesLineFn(void *context,
                char *fields[][2], int32_t fieldCount,
                UErrorCode *pErrorCode) {
     const Binaries *table=(const Binaries *)context;
     uint32_t first, last;

     u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", table->ucdFile, fields[0][0]);
         exit(*pErrorCode);
     }

     /* look up the binary property name */
     char *name=(char *)u_skipWhitespace(fields[1][0]);
     const Binary *match=NULL;
     for(int32_t k=0; k<table->binariesCount; ++k) {
         if(isToken(table->binaries[k].propName, name)) {
             match=&table->binaries[k];
             break;
         }
     }
     if(match==NULL) {
         /* ignore unrecognized properties */
         if(beVerbose) {
             addIgnoredProp(name, fields[1][1]);
         }
         return;
     }

     if(match->vecShift>=32) {
         fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
                         (int)match->vecShift, table->ucdFile, match->propName);
         exit(U_INTERNAL_PROGRAM_ERROR);
     }
     const uint32_t bit=U_MASK(match->vecShift);

     if(first==0 && last==0x10ffff) {
         /* Also set bits for initialValue and errorValue. */
         last=UPVEC_MAX_CP;
     }
     /* the same bit serves as both the value and the mask */
     upvec_setValue(pv, first, last, match->vecWord, bit, bit, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops error: unable to set %s code: %s\n",
                         match->propName, u_errorName(*pErrorCode));
         exit(*pErrorCode);
     }
 }

 static void
 parseBinariesFile(char *filename, char *basename, const char *suffix,
                   const Binaries *bin,
                   UErrorCode *pErrorCode) {
     char *fields[2][2];
     int32_t i;

     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
         return;
     }

     writeUCDFilename(basename, bin->ucdFile, suffix);

     ignoredPropsCount=0;

     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
     }

     if(beVerbose) {
         for(i=0; i<ignoredPropsCount; ++i) {
             printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
         }
     }
 }

 /* -------------------------------------------------------------------------- */

 /*
  * Allocate the module state: the properties vectors (pv) with
  * UPROPS_VECTOR_WORDS columns and the empty Script_Extensions string.
  * Any allocation failure is reported on stderr and terminates the tool.
  */
 U_CFUNC void
 initAdditionalProperties() {
     UErrorCode errorCode=U_ZERO_ERROR;
     pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
     if(U_FAILURE(errorCode)) {
         fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode));
         exit(errorCode);
     }
     scriptExtensions=new UnicodeString;
     /* ICU's operator new for its classes reports failure by returning NULL, not by throwing. */
     if(scriptExtensions==NULL) {
         fprintf(stderr, "error: out of memory allocating the Script_Extensions string\n");
         exit(U_MEMORY_ALLOCATION_ERROR);
     }
 }

 /*
  * Release the module state created by initAdditionalProperties() and
  * generateAdditionalProperties().
  * The globals are reset to NULL afterwards so that no dangling pointer
  * remains and a repeated call cannot free the same object twice.
  */
 U_CFUNC void
 exitAdditionalProperties() {
     utrie2_close(newTrie);
     newTrie=NULL;
     upvec_close(pv);
     pv=NULL;
     delete scriptExtensions;
     scriptExtensions=NULL;
 }

 /*
  * Read the additional UCD files and build the properties-vectors trie.
  *
  * filename is a writable path buffer; each parse helper receives a pointer
  * to its current end (basename) together with the UCD file name and suffix.
  * The files are processed in a fixed order, then pv is compacted into newTrie.
  * A failure while setting the East Asian Width defaults or while building
  * the trie is reported on stderr and terminates the tool.
  */
 U_CFUNC void
 generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
     /*
      * Preset East Asian Width defaults, see
      * http://www.unicode.org/reports/tr11/#Unassigned
      * (7.1 Unassigned and Private Use characters):
      *
      * Unassigned characters default to non-East Asian neutral (N),
      * except U+20000..U+2FFFD which are intended for CJK ideographs (W).
      * Private use characters default to ambiguous (A) because their
      * definition depends on context.
      *
      * N==0 for everything else - nothing to do.
      */
     static const struct {
         UChar32 start, end;
         uint32_t value;
     } eaDefaults[]={
         { 0xe000,   0xf8ff,   (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT) },
         { 0xf0000,  0xffffd,  (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT) },
         { 0x100000, 0x10fffd, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT) },
         { 0x20000,  0x2fffd,  (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT) }
     };

     /* the UCD file names get written at the end of the path that is already in filename */
     char *basename=filename+uprv_strlen(filename);

     /* add Han numeric types & values */
     parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);

     parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);

     /* Scripts must be stored before ScriptExtensions, which reads the script values back */
     parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
     parseTwoFieldFile(filename, basename, "ScriptExtensions", suffix, scriptExtensionsLineFn, pErrorCode);

     parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);

     parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);

     parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);
     parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);
     parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);

     /*
      * LineBreak-4.0.0.txt: all code points, assigned and unassigned, that are
      * not listed explicitly are given the value "XX".
      * XX==U_LB_UNKNOWN==0 - nothing to do
      */
     parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);

     /*
      * NOTE(review): this unconditionally clears any failure left behind by the
      * parse calls above - confirm that discarding those errors is intended.
      */
     *pErrorCode=U_ZERO_ERROR;
     for(int32_t i=0; i<LENGTHOF(eaDefaults); ++i) {
         upvec_setValue(pv, eaDefaults[i].start, eaDefaults[i].end,
                        0, eaDefaults[i].value, UPROPS_EA_MASK, pErrorCode);
     }
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
         exit(*pErrorCode);
     }

     /* parse EastAsianWidth.txt */
     parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);

     newTrie=upvec_compactToUTrie2WithRowIndexes(pv, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n",
                 u_errorName(*pErrorCode));
         exit(*pErrorCode);
     }
 }

 /* DerivedAge.txt ----------------------------------------------------------- */

 /*
  * Line callback for DerivedAge.txt.
  *
  * fields[0] holds the code point or range, fields[1] the age as
  * "major" or "major.minor", or the word "unassigned" (which is skipped).
  * major must be 1..15 and minor 0..15; the age is stored as (major<<4)|minor
  * in the age bit field of vector word 0.
  * Any error is reported on stderr and terminates the tool.
  */
 static void U_CALLCONV
 ageLineFn(void *context,
           char *fields[][2], int32_t fieldCount,
           UErrorCode *pErrorCode) {
     uint32_t rangeStart, rangeEnd;

     u_parseCodePointRange(fields[0][0], &rangeStart, &rangeEnd, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
         exit(*pErrorCode);
     }

     /* ignore "unassigned" (the default is already set to 0.0) */
     char *text=(char *)u_skipWhitespace(fields[1][0]);
     if(uprv_strncmp(text, "unassigned", 10)==0) {
         return;
     }

     /* major version number: 1..15, followed by '.', a blank, a tab or the end */
     char *after;
     uint32_t major=(uint32_t)uprv_strtoul(text, &after, 10);
     UBool majorOk=
         text!=after && major!=0 && major<=15 &&
         (*after=='.' || *after==' ' || *after=='\t' || *after==0);
     if(!majorOk) {
         fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
         *pErrorCode=U_PARSE_ERROR;
         exit(U_PARSE_ERROR);
     }
     uint32_t version=major<<4;

     /* optional minor version number: 0..15, followed by a blank, a tab or the end */
     if(*after=='.') {
         text=(char *)u_skipWhitespace(after+1);
         uint32_t minor=(uint32_t)uprv_strtoul(text, &after, 10);
         UBool minorOk=
             text!=after && minor<=15 &&
             (*after==' ' || *after=='\t' || *after==0);
         if(!minorOk) {
             fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
             *pErrorCode=U_PARSE_ERROR;
             exit(U_PARSE_ERROR);
         }
         version|=minor;
     }

     if(rangeStart==0 && rangeEnd==0x10ffff) {
         /* Also set bits for initialValue and errorValue. */
         rangeEnd=UPVEC_MAX_CP;
     }
     upvec_setValue(pv, rangeStart, rangeEnd, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
         exit(*pErrorCode);
     }
 }

 /* ScriptExtensions.txt ----------------------------------------------------- */

 /*
  * Line callback for ScriptExtensions.txt.
  *
  * fields[0] holds the code point or range, fields[1] a whitespace-separated
  * list of script value aliases.
  * The script codes are sorted into a list whose last unit has bit 15 set,
  * and the list is found in, or appended to, scriptExtensions.
  * Then, for each sub-range of start..end with a uniform stored script value,
  * the script bit field in vector word 0 is replaced with an encoding of
  * the (Script, Script_Extensions index) pair:
  * - Common or Inherited: a marker ORed with the list index
  * - any other script: a marker ORed with the index of an extra
  *   (script code, list index) pair stored in scriptExtensions
  * Any error is reported on stderr and terminates the tool.
  */
 static void U_CALLCONV
 scriptExtensionsLineFn(void *context,
                        char *fields[][2], int32_t fieldCount,
                        UErrorCode *pErrorCode) {
     uint32_t start, end;
     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 0 at %s\n", fields[0][0]);
         exit(*pErrorCode);
     }

     /* parse list of script codes */
     UnicodeString codes;  // vector of 16-bit UScriptCode values
     char *s=fields[1][0];
     for(;;) {
         // skip whitespace before each token
         s=(char *)u_skipWhitespace(s);
         if(*s==0 || *s==';') {
             break;
         }
         // skip non-whitespace, non-terminator characters to find the token limit
         char *limit=s;
         char c;
         do {
             c=*++limit;
         } while(!U_IS_INV_WHITESPACE(c) && c!=0 && c!=';');
         // NUL-terminate this token
         *limit=0;
         // convert the token (script property value alias) into a UScriptCode value
         int32_t value=u_getPropertyValueEnum(UCHAR_SCRIPT, s);
         if(value<0) {
             fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 1 at %s\n", s);
             exit(U_INVALID_FORMAT_ERROR);
         }
         // Insertion sort into the list of script codes.
         for(int32_t i=0;; ++i) {
             if(i<codes.length()) {
                 if(value<codes[i]) {
                     codes.insert(i, (UChar)value);
                     break;
                 } else if(value==codes[i]) {
                     fprintf(stderr,
                             "genprops: duplicate script code in ScriptExtensions.txt field 1 at %s "
                             "for U+%04lx..U+%04lx\n",
                             s, (long)start, (long)end);
                     exit(U_INVALID_FORMAT_ERROR);
                 }
                 // continue while value>codes[i]
             } else {
                 codes.append((UChar)value);
                 break;
             }
         }
         if(c==0 || c==';') {
             // the token ended at a terminator
             break;
         } else {
             // the token ended at U_IS_INV_WHITESPACE(c), continue after c
             s=limit+1;
         }
     }
     int32_t length=codes.length();
     if(length==0) {
         fprintf(stderr,
                 "genprops: missing values in ScriptExtensions.txt field 1 "
                 "for U+%04lx..U+%04lx\n",
                 (long)start, (long)end);
         exit(U_INVALID_FORMAT_ERROR);
     }
     // Set bit 15 on the last script code, for termination.
     codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));
     // Find this list of codes in the Script_Extensions data so far, or add this list.
     // listIndex must stay unchanged for all sub-ranges of this line.
     int32_t listIndex=scriptExtensions->indexOf(codes);
     if(listIndex<0) {
         listIndex=scriptExtensions->length();
         scriptExtensions->append(codes);
     }
     // Modify the Script data for each of the start..end code points
     // to include the Script_Extensions index.
     do {
         uint32_t scriptX=upvec_getValue(pv, (UChar32)start, 0)&UPROPS_SCRIPT_X_MASK;
         // Find the next code point that has a different script value.
         // We want to add the Script_Extensions index to the code point range start..next-1.
         UChar32 next;
         for(next=(UChar32)start+1;
             next<=(UChar32)end && scriptX==(upvec_getValue(pv, next, 0)&UPROPS_SCRIPT_X_MASK);
             ++next) {}
         if(scriptX>=UPROPS_SCRIPT_X_WITH_COMMON) {
             fprintf(stderr,
                     "genprops: ScriptExtensions.txt has values for U+%04lx..U+%04lx "
                     "which overlaps with a range including U+%04lx..U+%04lx\n",
                     (long)start, (long)end, (long)start, (long)(next-1));
             exit(U_INVALID_FORMAT_ERROR);
         }
         // Encode the (Script, Script_Extensions index) pair.
         // rangeIndex is the index stored for this sub-range only. It is kept
         // separate from listIndex so that a (script, index) pair created for
         // one sub-range does not replace the list index used by the next one.
         int32_t rangeIndex;
         if(scriptX==USCRIPT_COMMON) {
             rangeIndex=listIndex;
             scriptX=UPROPS_SCRIPT_X_WITH_COMMON|(uint32_t)rangeIndex;
         } else if(scriptX==USCRIPT_INHERITED) {
             rangeIndex=listIndex;
             scriptX=UPROPS_SCRIPT_X_WITH_INHERITED|(uint32_t)rangeIndex;
         } else {
             // Store an additional pair of 16-bit units for an unusual main Script code
             // together with the Script_Extensions index.
             UnicodeString codeIndexPair;
             codeIndexPair.append((UChar)scriptX).append((UChar)listIndex);
             rangeIndex=scriptExtensions->indexOf(codeIndexPair);
             if(rangeIndex<0) {
                 rangeIndex=scriptExtensions->length();
                 scriptExtensions->append(codeIndexPair);
             }
             scriptX=UPROPS_SCRIPT_X_WITH_OTHER|(uint32_t)rangeIndex;
         }
         if(rangeIndex>UPROPS_SCRIPT_MASK) {
             fprintf(stderr, "genprops: Script_Extensions indexes overflow bit field\n");
             exit(U_BUFFER_OVERFLOW_ERROR);
         }
         // Write the (Script, Script_Extensions index) pair into
         // the properties vector for start..next-1.
         upvec_setValue(pv, (UChar32)start, (UChar32)(next-1),
                         0, scriptX, UPROPS_SCRIPT_X_MASK, pErrorCode);
         if(U_FAILURE(*pErrorCode)) {
             fprintf(stderr, "genprops error: unable to set Script_Extensions: %s\n", u_errorName(*pErrorCode));
             exit(*pErrorCode);
         }
         start=next;
     } while(start<=end);
 }

 /* DerivedNumericValues.txt ------------------------------------------------- */

 /*
  * Line callback for DerivedNumericValues.txt.
  *
  * fields[0] holds the code point or range, fields[1] the numeric value.
  * The line covering all of 0..10FFFF is skipped.
  * Values with a non-zero fractional part are only accepted for code points
  * that already have a fraction stored; integer values are parsed here and
  * either verified against the value already stored for the code point
  * or added as a new value with numeric type U_NT_NUMERIC.
  * Any error is reported on stderr and terminates the tool.
  */
 static void U_CALLCONV
 numericLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
     Props newProps={ 0 };
     char *s, *numberLimit;
     uint32_t start, end, value, oldProps32;
     char c;
     UBool isFraction;

     /* get the code point range */
     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
         exit(*pErrorCode);
     }

     /*
      * Ignore the
      * # @missing: 0000..10FFFF; NaN
      * line from Unicode 5.1's DerivedNumericValues.txt:
      * The following code cannot parse "NaN", and we don't want to overwrite
      * the numeric values for all characters after reading most
      * from UnicodeData.txt already.
      */
     if(start==0 && end==0x10ffff) {
         return;
     }

     /* check if the numeric value is a fraction (this code does not handle any) */
     /*
      * A value counts as a fraction only if a non-zero digit follows the '.',
      * so "1.0" is still treated as an integer.
      * NOTE(review): uprv_strchr() is not bounded by fields[1][1]; presumably
      * the line is NUL-terminated after field 1 - confirm that a '.' in a
      * later field cannot be picked up here.
      */
     isFraction=FALSE;
     s=uprv_strchr(fields[1][0], '.');
     if(s!=NULL) {
         numberLimit=s+1;
         while('0'<=(c=*numberLimit++) && c<='9') {
             if(c!='0') {
                 isFraction=TRUE;
                 break;
             }
         }
     }

     if(isFraction) {
         value=0;
     } else {
         /* parse numeric value */
         s=(char *)u_skipWhitespace(fields[1][0]);

         /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
         if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
             /* large integers are encoded in a special way, see store.c */
             uint8_t exp=0;

             /* the leading digit is the value, the count of zeros after it is the exponent */
             value=s[0]-'0';
             numberLimit=s;
             while(*(++numberLimit)=='0') {
                 ++exp;
             }
             newProps.exponent=exp;
         } else {
             /* normal number parsing */
             value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
         }
         /* require at least one digit, then either a '.' or only whitespace up to the field limit */
         if(numberLimit<=s || (*numberLimit!='.' && u_skipWhitespace(numberLimit)!=fields[1][1]) || value>=0x80000000) {
             fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
             exit(U_PARSE_ERROR);
         }
     }

     /*
      * Unicode 4.0.1 removes the third column that used to list the numeric type.
      * Assume that either the data is the same as in UnicodeData.txt,
      * or else that the numeric type is "numeric".
      * This should work because we only expect to add numeric values for
      * Han characters; for those, UnicodeData.txt lists only ranges without
      * specific properties for single characters.
      */

     /* set the new numeric value */
     newProps.code=start;
     newProps.numericValue=(int32_t)value;       /* newly parsed numeric value */
     /* the exponent may have been set above */

     /* check or set the value for each code point of the range */
     for(; start<=end; ++start) {
         uint32_t newProps32;
         int32_t oldNtv;
         oldProps32=getProps(start);
         oldNtv=(int32_t)GET_NUMERIC_TYPE_VALUE(oldProps32);

         if(isFraction) {
             if(UPROPS_NTV_FRACTION_START<=oldNtv && oldNtv<UPROPS_NTV_LARGE_START) {
                 /* this code point was already listed with its numeric value in UnicodeData.txt */
                 continue;
             } else {
                 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
                 exit(U_PARSE_ERROR);
             }
         }

         /*
          * For simplicity, and because we only expect to set numeric values for Han characters,
          * for now we only allow to set these values for Lo characters.
          */
         if(oldNtv==UPROPS_NTV_NONE && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
             fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
             exit(U_PARSE_ERROR);
         }

         /* verify that we do not change an existing value (fractions were excluded above) */
         if(oldNtv!=UPROPS_NTV_NONE) {
             /* the code point already has a value stored */
             /* re-encode the parsed value with the stored numeric type and compare the encodings */
             newProps.numericType=UPROPS_NTV_GET_TYPE(oldNtv);
             newProps32=makeProps(&newProps);
             if(oldNtv!=GET_NUMERIC_TYPE_VALUE(newProps32)) {
                 fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
                 exit(U_PARSE_ERROR);
             }
             /* same value, continue */
         } else {
             /* the code point is getting a new numeric value */
             newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
             newProps32=makeProps(&newProps);
             if(beVerbose) {
                 printf("adding U+%04x numeric type %d encoded-numeric-type-value 0x%03x from %s\n",
                        (int)start, U_NT_NUMERIC, (int)GET_NUMERIC_TYPE_VALUE(newProps32), fields[0][0]);
             }

             /* combine the new numeric encoding with the general category that was already stored */
             addProps(start, newProps32|GET_CATEGORY(oldProps32));
         }
     }
 }

 /* data serialization ------------------------------------------------------- */

 /*
  * Serialize the additional-properties data and fill in the related indexes.
  *
  * The trie is serialized into p (capacity bytes). The indexes for the
  * vectors, the Script_Extensions data, the reserved slots and the data top
  * are computed from indexes[UPROPS_ADDITIONAL_TRIE_INDEX], which the caller
  * must have set already, plus the maximum-values words.
  * If p is not NULL and the data fits into capacity, then either C source
  * arrays are written to f (when f is not NULL) or the vectors and the
  * Script_Extensions units are copied into p after the serialized trie.
  *
  * Returns the size in bytes of the additional-properties data
  * (trie, vectors and Script_Extensions).
  * A serialization failure is reported on stderr and terminates the tool.
  */
 U_CFUNC int32_t
 writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
     const uint32_t *pvArray;
     int32_t pvRows, pvCount;
     int32_t length;
     UErrorCode errorCode;

     pvArray=upvec_getArray(pv, &pvRows, NULL);
     pvCount=pvRows*UPROPS_VECTOR_WORDS;

     errorCode=U_ZERO_ERROR;
     length=utrie2_serialize(newTrie, p, capacity, &errorCode);
     if(U_FAILURE(errorCode)) {
         /* NOTE(review): the message mentions utrie2_freeze() but only utrie2_serialize() is called here */
         fprintf(stderr,
                 "genprops error: utrie2_freeze(additional properties)+utrie2_serialize() failed: %s\n",
                 u_errorName(errorCode));
         exit(errorCode);
     }

     /* round up scriptExtensions to multiple of 4 bytes */
     if(scriptExtensions->length()&1) {
         scriptExtensions->append((UChar)0);
     }

     /* set indexes */
     /* the indexes count 32-bit words, hence length/4 for the trie bytes and length()/2 for 16-bit units */
     indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
         indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
     indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
     indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=
         indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
     indexes[UPROPS_RESERVED_INDEX_7]=
         indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]+scriptExtensions->length()/2;
     indexes[UPROPS_RESERVED_INDEX_8]=indexes[UPROPS_RESERVED_INDEX_7];
     indexes[UPROPS_DATA_TOP_INDEX]=indexes[UPROPS_RESERVED_INDEX_8];

     /* maximum values of the enumerated properties, packed with the same shifts as in the vector words */
     indexes[UPROPS_MAX_VALUES_INDEX]=
         (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
         (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
         (((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
     indexes[UPROPS_MAX_VALUES_2_INDEX]=
         (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
         (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
         (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
         (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
         ((int32_t)U_DT_COUNT-1);

     /* total size in bytes: 4 bytes per index unit from the trie start to the data top */
     int32_t additionalPropsSize=4*(indexes[UPROPS_DATA_TOP_INDEX]-indexes[UPROPS_ADDITIONAL_TRIE_INDEX]);
     if(p!=NULL && additionalPropsSize<=capacity) {
         if(beVerbose) {
             printf("size in bytes of additional props trie:%5u\n", (int)length);
         }
         if(f!=NULL) {
             /* write the data as C source code */
             usrc_writeUTrie2Arrays(f,
                 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
                 newTrie,
                 "\n};\n\n");
             usrc_writeUTrie2Struct(f,
                 "static const UTrie2 propsVectorsTrie={\n",
                 newTrie, "propsVectorsTrie_index", NULL,
                 "};\n\n");

             usrc_writeArray(f,
                 "static const uint32_t propsVectors[%ld]={\n",
                 pvArray, 32, pvCount,
                 "};\n\n");
             fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
             fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);

             usrc_writeArray(f,
                 "static const uint16_t scriptExtensions[%ld]={\n",
                 scriptExtensions->getBuffer(), 16, scriptExtensions->length(),
                 "};\n\n");
         } else {
             /* binary output: append the vectors and then the Script_Extensions units after the trie */
             p+=length;
             length=pvCount*4;
             uprv_memcpy(p, pvArray, length);

             p+=length;
             length=scriptExtensions->length()*2;
             uprv_memcpy(p, scriptExtensions->getBuffer(), length);
         }
         if(beVerbose) {
             printf("number of additional props vectors:    %5u\n", (int)pvRows);
             printf("number of 32-bit words per vector:     %5u\n", UPROPS_VECTOR_WORDS);
             printf("number of 16-bit scriptExtensions:     %5u\n", (int)scriptExtensions->length());
         }
     }

     return additionalPropsSize;
 }