| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2002-2010, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: props2.cpp |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2002feb24 |
| * created by: Markus W. Scherer |
| * |
| * Parse more Unicode Character Database files and store |
| * additional Unicode character properties in bit set vectors. |
| */ |
| |
| #include <stdio.h> |
| #include "unicode/utypes.h" |
| #include "unicode/uchar.h" |
| #include "unicode/unistr.h" |
| #include "unicode/uscript.h" |
| #include "cstring.h" |
| #include "cmemory.h" |
| #include "utrie2.h" |
| #include "uprops.h" |
| #include "propsvec.h" |
| #include "uparse.h" |
| #include "writesrc.h" |
| #include "genprops.h" |
| |
| #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
| |
| U_NAMESPACE_USE |
| |
| /* data --------------------------------------------------------------------- */ |
| |
| static UTrie2 *newTrie; |
| UPropsVectors *pv; |
| |
| static UnicodeString *scriptExtensions; |
| |
| /* miscellaneous ------------------------------------------------------------ */ |
| |
| static char * |
| trimTerminateField(char *s, char *limit) { |
| /* trim leading whitespace */ |
| s=(char *)u_skipWhitespace(s); |
| |
| /* trim trailing whitespace */ |
| while(s<limit && U_IS_INV_WHITESPACE(*(limit-1))) { |
| --limit; |
| } |
| *limit=0; |
| |
| return s; |
| } |
| |
| static void |
| parseTwoFieldFile(char *filename, char *basename, |
| const char *ucdFile, const char *suffix, |
| UParseLineFn *lineFn, |
| UErrorCode *pErrorCode) { |
| char *fields[2][2]; |
| |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| writeUCDFilename(basename, ucdFile, suffix); |
| |
| u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode)); |
| } |
| } |
| |
| static void U_CALLCONV |
| ageLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode); |
| |
| static void U_CALLCONV |
| scriptExtensionsLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode); |
| |
| static void |
| parseMultiFieldFile(char *filename, char *basename, |
| const char *ucdFile, const char *suffix, |
| int32_t fieldCount, |
| UParseLineFn *lineFn, |
| UErrorCode *pErrorCode) { |
| char *fields[20][2]; |
| |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| writeUCDFilename(basename, ucdFile, suffix); |
| |
| u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode)); |
| } |
| } |
| |
| static void U_CALLCONV |
| numericLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode); |
| |
| /* parse files with single enumerated properties ---------------------------- */ |
| |
| struct SingleEnum { |
| const char *ucdFile, *propName; |
| UProperty prop; |
| int32_t vecWord, vecShift; |
| uint32_t vecMask; |
| }; |
| typedef struct SingleEnum SingleEnum; |
| |
| static void |
| parseSingleEnumFile(char *filename, char *basename, const char *suffix, |
| const SingleEnum *sen, |
| UErrorCode *pErrorCode); |
| |
| static const SingleEnum scriptSingleEnum={ |
| "Scripts", "script", |
| UCHAR_SCRIPT, |
| 0, 0, UPROPS_SCRIPT_MASK |
| }; |
| |
| static const SingleEnum blockSingleEnum={ |
| "Blocks", "block", |
| UCHAR_BLOCK, |
| 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK |
| }; |
| |
| static const SingleEnum graphemeClusterBreakSingleEnum={ |
| "GraphemeBreakProperty", "Grapheme_Cluster_Break", |
| UCHAR_GRAPHEME_CLUSTER_BREAK, |
| 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK |
| }; |
| |
| static const SingleEnum wordBreakSingleEnum={ |
| "WordBreakProperty", "Word_Break", |
| UCHAR_WORD_BREAK, |
| 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK |
| }; |
| |
| static const SingleEnum sentenceBreakSingleEnum={ |
| "SentenceBreakProperty", "Sentence_Break", |
| UCHAR_SENTENCE_BREAK, |
| 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK |
| }; |
| |
| static const SingleEnum lineBreakSingleEnum={ |
| "LineBreak", "line break", |
| UCHAR_LINE_BREAK, |
| 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK |
| }; |
| |
| static const SingleEnum eawSingleEnum={ |
| "EastAsianWidth", "east asian width", |
| UCHAR_EAST_ASIAN_WIDTH, |
| 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK |
| }; |
| |
| static void U_CALLCONV |
| singleEnumLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode) { |
| const SingleEnum *sen; |
| char *s; |
| uint32_t start, end, uv; |
| int32_t value; |
| |
| sen=(const SingleEnum *)context; |
| |
| u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]); |
| exit(*pErrorCode); |
| } |
| |
| /* parse property alias */ |
| s=trimTerminateField(fields[1][0], fields[1][1]); |
| value=u_getPropertyValueEnum(sen->prop, s); |
| if(value<0) { |
| if(sen->prop==UCHAR_BLOCK) { |
| if(isToken("Greek", s)) { |
| value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */ |
| } else if(isToken("Combining Marks for Symbols", s)) { |
| value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */ |
| } else if(isToken("Private Use", s)) { |
| value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */ |
| } |
| } |
| } |
| if(value<0) { |
| fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n", |
| sen->propName, sen->ucdFile, s); |
| exit(U_PARSE_ERROR); |
| } |
| |
| uv=(uint32_t)(value<<sen->vecShift); |
| if((uv&sen->vecMask)!=uv) { |
| fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n", |
| sen->propName, (int)uv, s); |
| exit(U_INTERNAL_PROGRAM_ERROR); |
| } |
| |
| if(start==0 && end==0x10ffff) { |
| /* Also set bits for initialValue and errorValue. */ |
| end=UPVEC_MAX_CP; |
| } |
| upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops error: unable to set %s code: %s\n", |
| sen->propName, u_errorName(*pErrorCode)); |
| exit(*pErrorCode); |
| } |
| } |
| |
| static void |
| parseSingleEnumFile(char *filename, char *basename, const char *suffix, |
| const SingleEnum *sen, |
| UErrorCode *pErrorCode) { |
| char *fields[2][2]; |
| |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| writeUCDFilename(basename, sen->ucdFile, suffix); |
| |
| u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode)); |
| } |
| } |
| |
| /* parse files with multiple binary properties ------------------------------ */ |
| |
| struct Binary { |
| const char *propName; |
| int32_t vecWord, vecShift; |
| }; |
| typedef struct Binary Binary; |
| |
| struct Binaries { |
| const char *ucdFile; |
| const Binary *binaries; |
| int32_t binariesCount; |
| }; |
| typedef struct Binaries Binaries; |
| |
| static const Binary |
| propListNames[]={ |
| { "White_Space", 1, UPROPS_WHITE_SPACE }, |
| { "Dash", 1, UPROPS_DASH }, |
| { "Hyphen", 1, UPROPS_HYPHEN }, |
| { "Quotation_Mark", 1, UPROPS_QUOTATION_MARK }, |
| { "Terminal_Punctuation", 1, UPROPS_TERMINAL_PUNCTUATION }, |
| { "Hex_Digit", 1, UPROPS_HEX_DIGIT }, |
| { "ASCII_Hex_Digit", 1, UPROPS_ASCII_HEX_DIGIT }, |
| { "Ideographic", 1, UPROPS_IDEOGRAPHIC }, |
| { "Diacritic", 1, UPROPS_DIACRITIC }, |
| { "Extender", 1, UPROPS_EXTENDER }, |
| { "Noncharacter_Code_Point", 1, UPROPS_NONCHARACTER_CODE_POINT }, |
| { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK }, |
| { "IDS_Binary_Operator", 1, UPROPS_IDS_BINARY_OPERATOR }, |
| { "IDS_Trinary_Operator", 1, UPROPS_IDS_TRINARY_OPERATOR }, |
| { "Radical", 1, UPROPS_RADICAL }, |
| { "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH }, |
| { "Deprecated", 1, UPROPS_DEPRECATED }, |
| { "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION }, |
| |
| /* new properties in Unicode 4.0.1 */ |
| { "STerm", 1, UPROPS_S_TERM }, |
| { "Variation_Selector", 1, UPROPS_VARIATION_SELECTOR }, |
| |
| /* new properties in Unicode 4.1 */ |
| { "Pattern_Syntax", 1, UPROPS_PATTERN_SYNTAX }, |
| { "Pattern_White_Space", 1, UPROPS_PATTERN_WHITE_SPACE } |
| }; |
| |
| static const Binaries |
| propListBinaries={ |
| "PropList", propListNames, LENGTHOF(propListNames) |
| }; |
| |
| static const Binary |
| derCorePropsNames[]={ |
| { "XID_Start", 1, UPROPS_XID_START }, |
| { "XID_Continue", 1, UPROPS_XID_CONTINUE }, |
| |
| /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */ |
| { "Math", 1, UPROPS_MATH }, |
| { "Alphabetic", 1, UPROPS_ALPHABETIC }, |
| { "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND }, |
| { "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT }, |
| |
| /* new properties bits in ICU 2.6/format version 3.2 */ |
| { "ID_Start", 1, UPROPS_ID_START }, |
| { "ID_Continue", 1, UPROPS_ID_CONTINUE }, |
| { "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE }, |
| |
| /* |
| * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt |
| * to DerivedCoreProperties.txt and deprecates it. |
| */ |
| { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK } |
| }; |
| |
| static const Binaries |
| derCorePropsBinaries={ |
| "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) |
| }; |
| |
| static char ignoredProps[100][64]; |
| static int32_t ignoredPropsCount; |
| |
| static void |
| addIgnoredProp(char *s, char *limit) { |
| int32_t i; |
| |
| s=trimTerminateField(s, limit); |
| for(i=0; i<ignoredPropsCount; ++i) { |
| if(0==uprv_strcmp(ignoredProps[i], s)) { |
| return; |
| } |
| } |
| uprv_strcpy(ignoredProps[ignoredPropsCount++], s); |
| } |
| |
| static void U_CALLCONV |
| binariesLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode) { |
| const Binaries *bin; |
| char *s; |
| uint32_t start, end, uv; |
| int32_t i; |
| |
| bin=(const Binaries *)context; |
| |
| u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); |
| exit(*pErrorCode); |
| } |
| |
| /* parse binary property name */ |
| s=(char *)u_skipWhitespace(fields[1][0]); |
| for(i=0;; ++i) { |
| if(i==bin->binariesCount) { |
| /* ignore unrecognized properties */ |
| if(beVerbose) { |
| addIgnoredProp(s, fields[1][1]); |
| } |
| return; |
| } |
| if(isToken(bin->binaries[i].propName, s)) { |
| break; |
| } |
| } |
| |
| if(bin->binaries[i].vecShift>=32) { |
| fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n", |
| (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName); |
| exit(U_INTERNAL_PROGRAM_ERROR); |
| } |
| uv=U_MASK(bin->binaries[i].vecShift); |
| |
| if(start==0 && end==0x10ffff) { |
| /* Also set bits for initialValue and errorValue. */ |
| end=UPVEC_MAX_CP; |
| } |
| upvec_setValue(pv, start, end, bin->binaries[i].vecWord, uv, uv, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops error: unable to set %s code: %s\n", |
| bin->binaries[i].propName, u_errorName(*pErrorCode)); |
| exit(*pErrorCode); |
| } |
| } |
| |
| static void |
| parseBinariesFile(char *filename, char *basename, const char *suffix, |
| const Binaries *bin, |
| UErrorCode *pErrorCode) { |
| char *fields[2][2]; |
| int32_t i; |
| |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| writeUCDFilename(basename, bin->ucdFile, suffix); |
| |
| ignoredPropsCount=0; |
| |
| u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); |
| } |
| |
| if(beVerbose) { |
| for(i=0; i<ignoredPropsCount; ++i) { |
| printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile); |
| } |
| } |
| } |
| |
| /* -------------------------------------------------------------------------- */ |
| |
| U_CFUNC void |
| initAdditionalProperties() { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode); |
| if(U_FAILURE(errorCode)) { |
| fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode)); |
| exit(errorCode); |
| } |
| scriptExtensions=new UnicodeString; |
| } |
| |
| U_CFUNC void |
| exitAdditionalProperties() { |
| utrie2_close(newTrie); |
| upvec_close(pv); |
| delete scriptExtensions; |
| } |
| |
| U_CFUNC void |
| generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) { |
| char *basename; |
| |
| basename=filename+uprv_strlen(filename); |
| |
| /* process various UCD .txt files */ |
| |
| /* add Han numeric types & values */ |
| parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode); |
| |
| parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode); |
| |
| parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode); |
| |
| parseTwoFieldFile(filename, basename, "ScriptExtensions", suffix, scriptExtensionsLineFn, pErrorCode); |
| |
| parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode); |
| |
| parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode); |
| |
| parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode); |
| |
| parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode); |
| |
| parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode); |
| |
| parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode); |
| |
| /* |
| * LineBreak-4.0.0.txt: |
| * - All code points, assigned and unassigned, that are not listed |
| * explicitly are given the value "XX". |
| * |
| * XX==U_LB_UNKNOWN==0 - nothing to do |
| */ |
| parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode); |
| |
| /* |
| * Preset East Asian Width defaults: |
| * |
| * http://www.unicode.org/reports/tr11/#Unassigned |
| * 7.1 Unassigned and Private Use characters |
| * |
| * All unassigned characters are by default classified as non-East Asian neutral, |
| * except for the range U+20000 to U+2FFFD, |
| * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W). |
| * All Private use characters are by default classified as ambiguous, |
| * since their definition depends on context. |
| * |
| * N for all ==0 - nothing to do |
| * A for Private Use |
| * W for plane 2 |
| */ |
| *pErrorCode=U_ZERO_ERROR; |
| upvec_setValue(pv, 0xe000, 0xf8ff, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode); |
| upvec_setValue(pv, 0xf0000, 0xffffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode); |
| upvec_setValue(pv, 0x100000, 0x10fffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode); |
| upvec_setValue(pv, 0x20000, 0x2fffd, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode)); |
| exit(*pErrorCode); |
| } |
| |
| /* parse EastAsianWidth.txt */ |
| parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode); |
| |
| newTrie=upvec_compactToUTrie2WithRowIndexes(pv, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", |
| u_errorName(*pErrorCode)); |
| exit(*pErrorCode); |
| } |
| } |
| |
| /* DerivedAge.txt ----------------------------------------------------------- */ |
| |
| static void U_CALLCONV |
| ageLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode) { |
| char *s, *numberLimit; |
| uint32_t value, start, end, version; |
| |
| u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]); |
| exit(*pErrorCode); |
| } |
| |
| /* ignore "unassigned" (the default is already set to 0.0) */ |
| s=(char *)u_skipWhitespace(fields[1][0]); |
| if(0==uprv_strncmp(s, "unassigned", 10)) { |
| return; |
| } |
| |
| /* parse version number */ |
| value=(uint32_t)uprv_strtoul(s, &numberLimit, 10); |
| if(s==numberLimit || value==0 || value>15 || (*numberLimit!='.' && *numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) { |
| fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]); |
| *pErrorCode=U_PARSE_ERROR; |
| exit(U_PARSE_ERROR); |
| } |
| version=value<<4; |
| |
| /* parse minor version number */ |
| if(*numberLimit=='.') { |
| s=(char *)u_skipWhitespace(numberLimit+1); |
| value=(uint32_t)uprv_strtoul(s, &numberLimit, 10); |
| if(s==numberLimit || value>15 || (*numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) { |
| fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]); |
| *pErrorCode=U_PARSE_ERROR; |
| exit(U_PARSE_ERROR); |
| } |
| version|=value; |
| } |
| |
| if(start==0 && end==0x10ffff) { |
| /* Also set bits for initialValue and errorValue. */ |
| end=UPVEC_MAX_CP; |
| } |
| upvec_setValue(pv, start, end, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode)); |
| exit(*pErrorCode); |
| } |
| } |
| |
| /* ScriptExtensions.txt ----------------------------------------------------- */ |
| |
| static void U_CALLCONV |
| scriptExtensionsLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode) { |
| uint32_t start, end; |
| u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 0 at %s\n", fields[0][0]); |
| exit(*pErrorCode); |
| } |
| |
| /* parse list of script codes */ |
| UnicodeString codes; // vector of 16-bit UScriptCode values |
| char *s=fields[1][0]; |
| for(;;) { |
| // skip whitespace before each token |
| s=(char *)u_skipWhitespace(s); |
| if(*s==0 || *s==';') { |
| break; |
| } |
| // skip non-whitespace, non-terminator characters to find the token limit |
| char *limit=s; |
| char c; |
| do { |
| c=*++limit; |
| } while(!U_IS_INV_WHITESPACE(c) && c!=0 && c!=';'); |
| // NUL-terminated this token |
| *limit=0; |
| // convert the token (script property value alias) into a UScriptCode value |
| int32_t value=u_getPropertyValueEnum(UCHAR_SCRIPT, s); |
| if(value<0) { |
| fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 1 at %s\n", s); |
| exit(U_INVALID_FORMAT_ERROR); |
| } |
| // Insertion sort into the list of script codes. |
| for(int32_t i=0;; ++i) { |
| if(i<codes.length()) { |
| if(value<codes[i]) { |
| codes.insert(i, (UChar)value); |
| break; |
| } else if(value==codes[i]) { |
| fprintf(stderr, |
| "genprops: duplicate script code in ScriptExtensions.txt field 1 at %s " |
| "for U+%04lx..U+%04lx\n", |
| s, (long)start, (long)end); |
| exit(U_INVALID_FORMAT_ERROR); |
| } |
| // continue while value>codes[i] |
| } else { |
| codes.append((UChar)value); |
| break; |
| } |
| } |
| if(c==0 || c==';') { |
| // the token ended at a terminator |
| break; |
| } else { |
| // the token ended at U_IS_INV_WHITESPACE(c), continue after c |
| s=limit+1; |
| } |
| } |
| int32_t length=codes.length(); |
| if(length==0) { |
| fprintf(stderr, |
| "genprops: missing values in ScriptExtensions.txt field 1 " |
| "for U+%04lx..U+%04lx\n", |
| (long)start, (long)end); |
| exit(U_INVALID_FORMAT_ERROR); |
| } |
| // Set bit 15 on the last script code, for termination. |
| codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000)); |
| // Find this list of codes in the Script_Extensions data so far, or add this list. |
| int32_t index=scriptExtensions->indexOf(codes); |
| if(index<0) { |
| index=scriptExtensions->length(); |
| scriptExtensions->append(codes); |
| } |
| // Modify the Script data for each of the start..end code points |
| // to include the Script_Extensions index. |
| do { |
| uint32_t scriptX=upvec_getValue(pv, (UChar32)start, 0)&UPROPS_SCRIPT_X_MASK; |
| // Find the next code point that has a different script value. |
| // We want to add the Script_Extensions index to the code point range start..next-1. |
| UChar32 next; |
| for(next=(UChar32)start+1; |
| next<=(UChar32)end && scriptX==(upvec_getValue(pv, next, 0)&UPROPS_SCRIPT_X_MASK); |
| ++next) {} |
| if(scriptX>=UPROPS_SCRIPT_X_WITH_COMMON) { |
| fprintf(stderr, |
| "genprops: ScriptExtensions.txt has values for U+%04lx..U+%04lx " |
| "which overlaps with a range including U+%04lx..U+%04lx\n", |
| (long)start, (long)end, (long)start, (long)(next-1)); |
| exit(U_INVALID_FORMAT_ERROR); |
| } |
| // Encode the (Script, Script_Extensions index) pair. |
| if(scriptX==USCRIPT_COMMON) { |
| scriptX=UPROPS_SCRIPT_X_WITH_COMMON|(uint32_t)index; |
| } else if(scriptX==USCRIPT_INHERITED) { |
| scriptX=UPROPS_SCRIPT_X_WITH_INHERITED|(uint32_t)index; |
| } else { |
| // Store an additional pair of 16-bit units for an unusual main Script code |
| // together with the Script_Extensions index. |
| UnicodeString codeIndexPair; |
| codeIndexPair.append((UChar)scriptX).append((UChar)index); |
| index=scriptExtensions->indexOf(codeIndexPair); |
| if(index<0) { |
| index=scriptExtensions->length(); |
| scriptExtensions->append(codeIndexPair); |
| } |
| scriptX=UPROPS_SCRIPT_X_WITH_OTHER|(uint32_t)index; |
| } |
| if(index>UPROPS_SCRIPT_MASK) { |
| fprintf(stderr, "genprops: Script_Extensions indexes overflow bit field\n"); |
| exit(U_BUFFER_OVERFLOW_ERROR); |
| } |
| // Write the (Script, Script_Extensions index) pair into |
| // the properties vector for start..next-1. |
| upvec_setValue(pv, (UChar32)start, (UChar32)(next-1), |
| 0, scriptX, UPROPS_SCRIPT_X_MASK, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops error: unable to set Script_Extensions: %s\n", u_errorName(*pErrorCode)); |
| exit(*pErrorCode); |
| } |
| start=next; |
| } while(start<=end); |
| } |
| |
| /* DerivedNumericValues.txt ------------------------------------------------- */ |
| |
| static void U_CALLCONV |
| numericLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode) { |
| Props newProps={ 0 }; |
| char *s, *numberLimit; |
| uint32_t start, end, value, oldProps32; |
| char c; |
| UBool isFraction; |
| |
| /* get the code point range */ |
| u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]); |
| exit(*pErrorCode); |
| } |
| |
| /* |
| * Ignore the |
| * # @missing: 0000..10FFFF; NaN |
| * line from Unicode 5.1's DerivedNumericValues.txt: |
| * The following code cannot parse "NaN", and we don't want to overwrite |
| * the numeric values for all characters after reading most |
| * from UnicodeData.txt already. |
| */ |
| if(start==0 && end==0x10ffff) { |
| return; |
| } |
| |
| /* check if the numeric value is a fraction (this code does not handle any) */ |
| isFraction=FALSE; |
| s=uprv_strchr(fields[1][0], '.'); |
| if(s!=NULL) { |
| numberLimit=s+1; |
| while('0'<=(c=*numberLimit++) && c<='9') { |
| if(c!='0') { |
| isFraction=TRUE; |
| break; |
| } |
| } |
| } |
| |
| if(isFraction) { |
| value=0; |
| } else { |
| /* parse numeric value */ |
| s=(char *)u_skipWhitespace(fields[1][0]); |
| |
| /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */ |
| if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') { |
| /* large integers are encoded in a special way, see store.c */ |
| uint8_t exp=0; |
| |
| value=s[0]-'0'; |
| numberLimit=s; |
| while(*(++numberLimit)=='0') { |
| ++exp; |
| } |
| newProps.exponent=exp; |
| } else { |
| /* normal number parsing */ |
| value=(uint32_t)uprv_strtoul(s, &numberLimit, 10); |
| } |
| if(numberLimit<=s || (*numberLimit!='.' && u_skipWhitespace(numberLimit)!=fields[1][1]) || value>=0x80000000) { |
| fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]); |
| exit(U_PARSE_ERROR); |
| } |
| } |
| |
| /* |
| * Unicode 4.0.1 removes the third column that used to list the numeric type. |
| * Assume that either the data is the same as in UnicodeData.txt, |
| * or else that the numeric type is "numeric". |
| * This should work because we only expect to add numeric values for |
| * Han characters; for those, UnicodeData.txt lists only ranges without |
| * specific properties for single characters. |
| */ |
| |
| /* set the new numeric value */ |
| newProps.code=start; |
| newProps.numericValue=(int32_t)value; /* newly parsed numeric value */ |
| /* the exponent may have been set above */ |
| |
| for(; start<=end; ++start) { |
| uint32_t newProps32; |
| int32_t oldNtv; |
| oldProps32=getProps(start); |
| oldNtv=(int32_t)GET_NUMERIC_TYPE_VALUE(oldProps32); |
| |
| if(isFraction) { |
| if(UPROPS_NTV_FRACTION_START<=oldNtv && oldNtv<UPROPS_NTV_LARGE_START) { |
| /* this code point was already listed with its numeric value in UnicodeData.txt */ |
| continue; |
| } else { |
| fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]); |
| exit(U_PARSE_ERROR); |
| } |
| } |
| |
| /* |
| * For simplicity, and because we only expect to set numeric values for Han characters, |
| * for now we only allow to set these values for Lo characters. |
| */ |
| if(oldNtv==UPROPS_NTV_NONE && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) { |
| fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]); |
| exit(U_PARSE_ERROR); |
| } |
| |
| /* verify that we do not change an existing value (fractions were excluded above) */ |
| if(oldNtv!=UPROPS_NTV_NONE) { |
| /* the code point already has a value stored */ |
| newProps.numericType=UPROPS_NTV_GET_TYPE(oldNtv); |
| newProps32=makeProps(&newProps); |
| if(oldNtv!=GET_NUMERIC_TYPE_VALUE(newProps32)) { |
| fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start); |
| exit(U_PARSE_ERROR); |
| } |
| /* same value, continue */ |
| } else { |
| /* the code point is getting a new numeric value */ |
| newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */ |
| newProps32=makeProps(&newProps); |
| if(beVerbose) { |
| printf("adding U+%04x numeric type %d encoded-numeric-type-value 0x%03x from %s\n", |
| (int)start, U_NT_NUMERIC, (int)GET_NUMERIC_TYPE_VALUE(newProps32), fields[0][0]); |
| } |
| |
| addProps(start, newProps32|GET_CATEGORY(oldProps32)); |
| } |
| } |
| } |
| |
| /* data serialization ------------------------------------------------------- */ |
| |
| U_CFUNC int32_t |
| writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) { |
| const uint32_t *pvArray; |
| int32_t pvRows, pvCount; |
| int32_t length; |
| UErrorCode errorCode; |
| |
| pvArray=upvec_getArray(pv, &pvRows, NULL); |
| pvCount=pvRows*UPROPS_VECTOR_WORDS; |
| |
| errorCode=U_ZERO_ERROR; |
| length=utrie2_serialize(newTrie, p, capacity, &errorCode); |
| if(U_FAILURE(errorCode)) { |
| fprintf(stderr, |
| "genprops error: utrie2_freeze(additional properties)+utrie2_serialize() failed: %s\n", |
| u_errorName(errorCode)); |
| exit(errorCode); |
| } |
| |
| /* round up scriptExtensions to multiple of 4 bytes */ |
| if(scriptExtensions->length()&1) { |
| scriptExtensions->append((UChar)0); |
| } |
| |
| /* set indexes */ |
| indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]= |
| indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4; |
| indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS; |
| indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]= |
| indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount; |
| indexes[UPROPS_RESERVED_INDEX_7]= |
| indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]+scriptExtensions->length()/2; |
| indexes[UPROPS_RESERVED_INDEX_8]=indexes[UPROPS_RESERVED_INDEX_7]; |
| indexes[UPROPS_DATA_TOP_INDEX]=indexes[UPROPS_RESERVED_INDEX_8]; |
| |
| indexes[UPROPS_MAX_VALUES_INDEX]= |
| (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)| |
| (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)| |
| (((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK); |
| indexes[UPROPS_MAX_VALUES_2_INDEX]= |
| (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)| |
| (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)| |
| (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)| |
| (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)| |
| ((int32_t)U_DT_COUNT-1); |
| |
| int32_t additionalPropsSize=4*(indexes[UPROPS_DATA_TOP_INDEX]-indexes[UPROPS_ADDITIONAL_TRIE_INDEX]); |
| if(p!=NULL && additionalPropsSize<=capacity) { |
| if(beVerbose) { |
| printf("size in bytes of additional props trie:%5u\n", (int)length); |
| } |
| if(f!=NULL) { |
| usrc_writeUTrie2Arrays(f, |
| "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL, |
| newTrie, |
| "\n};\n\n"); |
| usrc_writeUTrie2Struct(f, |
| "static const UTrie2 propsVectorsTrie={\n", |
| newTrie, "propsVectorsTrie_index", NULL, |
| "};\n\n"); |
| |
| usrc_writeArray(f, |
| "static const uint32_t propsVectors[%ld]={\n", |
| pvArray, 32, pvCount, |
| "};\n\n"); |
| fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount); |
| fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]); |
| |
| usrc_writeArray(f, |
| "static const uint16_t scriptExtensions[%ld]={\n", |
| scriptExtensions->getBuffer(), 16, scriptExtensions->length(), |
| "};\n\n"); |
| } else { |
| p+=length; |
| length=pvCount*4; |
| uprv_memcpy(p, pvArray, length); |
| |
| p+=length; |
| length=scriptExtensions->length()*2; |
| uprv_memcpy(p, scriptExtensions->getBuffer(), length); |
| } |
| if(beVerbose) { |
| printf("number of additional props vectors: %5u\n", (int)pvRows); |
| printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS); |
| printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions->length()); |
| } |
| } |
| |
| return additionalPropsSize; |
| } |