| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: gennorm.c |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2001may25 |
| * created by: Markus W. Scherer |
| * |
| * This program reads the Unicode character database text file, |
| * parses it, and extracts the data for normalization. |
| * It then preprocesses it and writes a binary file for efficient use |
| * in various Unicode text normalization processes. |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include "unicode/utypes.h" |
| #include "unicode/uchar.h" |
| #include "unicode/putil.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "unicode/udata.h" |
| #include "unewdata.h" |
| #include "uoptions.h" |
| #include "uparse.h" |
| #include "unormimp.h" |
| |
| U_CDECL_BEGIN |
| #include "gennorm.h" |
| U_CDECL_END |
| |
| #ifdef WIN32 |
| # pragma warning(disable: 4100) |
| #endif |
| |
| UBool beVerbose=FALSE, haveCopyright=TRUE; |
| |
| /* prototypes --------------------------------------------------------------- */ |
| |
| static void |
| parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode); |
| |
| static void |
| parseDB(const char *filename, UErrorCode *pErrorCode); |
| |
| /* -------------------------------------------------------------------------- */ |
| |
| static UOption options[]={ |
| UOPTION_HELP_H, |
| UOPTION_HELP_QUESTION_MARK, |
| UOPTION_VERBOSE, |
| UOPTION_COPYRIGHT, |
| UOPTION_DESTDIR, |
| UOPTION_SOURCEDIR, |
| { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 } |
| }; |
| |
| extern int |
| main(int argc, char* argv[]) { |
| char filename[300]; |
| const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; |
| char *basename=NULL; |
| UErrorCode errorCode=U_ZERO_ERROR; |
| |
| /* preset then read command line options */ |
| options[4].value=u_getDataDirectory(); |
| options[5].value=""; |
| options[6].value="3.0.0"; |
| argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); |
| |
| /* error handling, printing usage message */ |
| if(argc<0) { |
| fprintf(stderr, |
| "error in command line argument \"%s\"\n", |
| argv[-argc]); |
| } |
| if(argc<0 || options[0].doesOccur || options[1].doesOccur) { |
| /* |
| * Broken into chucks because the C89 standard says the minimum |
| * required supported string length is 509 bytes. |
| */ |
| fprintf(stderr, |
| "Usage: %s [-options] [suffix]\n" |
| "\n" |
| "Read the UnicodeData.txt file and other Unicode properties files and\n" |
| "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n" |
| "\n", |
| argv[0]); |
| fprintf(stderr, |
| "Options:\n" |
| "\t-h or -? or --help this usage text\n" |
| "\t-v or --verbose verbose output\n" |
| "\t-c or --copyright include a copyright notice\n" |
| "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"); |
| fprintf(stderr, |
| "\t-d or --destdir destination directory, followed by the path\n" |
| "\t-s or --sourcedir source directory, followed by the path\n" |
| "\tsuffix suffix that is to be appended with a '-'\n" |
| "\t to the source file basenames before opening;\n" |
| "\t 'gennorm new' will read UnicodeData-new.txt etc.\n"); |
| return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; |
| } |
| |
| /* get the options values */ |
| beVerbose=options[2].doesOccur; |
| haveCopyright=options[3].doesOccur; |
| srcDir=options[5].value; |
| destDir=options[4].value; |
| |
| if(argc>=2) { |
| suffix=argv[1]; |
| } else { |
| suffix=NULL; |
| } |
| |
| setUnicodeVersion(options[6].value); |
| |
| /* prepare the filename beginning with the source dir */ |
| uprv_strcpy(filename, srcDir); |
| basename=filename+uprv_strlen(filename); |
| if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { |
| *basename++=U_FILE_SEP_CHAR; |
| } |
| |
| /* initialize */ |
| init(); |
| |
| /* process DerivedNormalizationProperties.txt (quick check flags) */ |
| if(suffix==NULL) { |
| uprv_strcpy(basename, "DerivedNormalizationProperties.txt"); |
| } else { |
| uprv_strcpy(basename, "DerivedNormalizationProperties"); |
| basename[30]='-'; |
| uprv_strcpy(basename+31, suffix); |
| uprv_strcat(basename+31, ".txt"); |
| } |
| parseDerivedNormalizationProperties(filename, &errorCode); |
| |
| /* process UnicodeData.txt */ |
| if(suffix==NULL) { |
| uprv_strcpy(basename, "UnicodeData.txt"); |
| } else { |
| uprv_strcpy(basename, "UnicodeData"); |
| basename[11]='-'; |
| uprv_strcpy(basename+12, suffix); |
| uprv_strcat(basename+12, ".txt"); |
| } |
| parseDB(filename, &errorCode); |
| |
| /* process parsed data */ |
| if(U_SUCCESS(errorCode)) { |
| processData(); |
| |
| /* write the properties data file */ |
| generateData(destDir); |
| |
| cleanUpData(); |
| } |
| |
| return errorCode; |
| } |
| |
| /* parsing helpers ---------------------------------------------------------- */ |
| |
| static const char * |
| skipWhitespace(const char *s) { |
| while(*s==' ' || *s=='\t') { |
| ++s; |
| } |
| return s; |
| } |
| |
| /* |
| * parse a list of code points |
| * store them as a UTF-32 string in dest[destCapacity] with the string length in dest[0] |
| * set the first code point in *pFirst |
| * return the number of code points |
| */ |
| static int32_t |
| parseCodePoints(const char *s, |
| uint32_t *dest, int32_t destCapacity, |
| UErrorCode *pErrorCode) { |
| char *end; |
| uint32_t value; |
| int32_t count; |
| |
| count=0; |
| for(;;) { |
| s=skipWhitespace(s); |
| if(*s==';' || *s==0) { |
| return count; |
| } |
| |
| /* read one code point */ |
| value=(uint32_t)uprv_strtoul(s, &end, 16); |
| if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) { |
| fprintf(stderr, "gennorm: syntax error parsing code point at %s\n", s); |
| *pErrorCode=U_PARSE_ERROR; |
| return -1; |
| } |
| |
| /* overflow? */ |
| if(count>=destCapacity) { |
| fprintf(stderr, "gennorm: code point sequence too long at at %s\n", s); |
| *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| return -1; |
| } |
| |
| /* append it to the destination array */ |
| dest[count++]=value; |
| |
| /* go to the following characters */ |
| s=end; |
| } |
| } |
| |
| /* read a range like start or start..end */ |
| static int32_t |
| parseCodePointRange(const char *s, |
| uint32_t *pStart, uint32_t *pEnd, |
| UErrorCode *pErrorCode) { |
| char *end; |
| uint32_t value; |
| |
| s=skipWhitespace(s); |
| if(*s==';' || *s==0) { |
| fprintf(stderr, "gennorm: syntax error parsing range at %s - empty field\n", s); |
| *pErrorCode=U_PARSE_ERROR; |
| return -1; |
| } |
| |
| /* read the start code point */ |
| value=(uint32_t)uprv_strtoul(s, &end, 16); |
| if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || value>=0x110000) { |
| fprintf(stderr, "gennorm: syntax error parsing range start code point at %s\n", s); |
| *pErrorCode=U_PARSE_ERROR; |
| return -1; |
| } |
| *pStart=*pEnd=value; |
| |
| /* is there a "..end"? */ |
| s=skipWhitespace(end); |
| if(*s==';' || *s==0) { |
| return 1; |
| } |
| |
| if(*s!='.' || s[1]!='.') { |
| fprintf(stderr, "gennorm: syntax error parsing range at %s\n", s); |
| *pErrorCode=U_PARSE_ERROR; |
| return -1; |
| } |
| s+=2; |
| |
| /* read the end code point */ |
| value=(uint32_t)uprv_strtoul(s, &end, 16); |
| if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) { |
| fprintf(stderr, "gennorm: syntax error parsing range end code point at %s\n", s); |
| *pErrorCode=U_PARSE_ERROR; |
| return -1; |
| } |
| *pEnd=value; |
| |
| /* is this a valid range? */ |
| if(value<*pStart) { |
| fprintf(stderr, "gennorm: syntax error parsing range at %s - not a valid range\n", s); |
| *pErrorCode=U_PARSE_ERROR; |
| return -1; |
| } |
| |
| /* no garbage after that? */ |
| s=skipWhitespace(end); |
| if(*s==';' || *s==0) { |
| return value-*pStart+1; |
| } else { |
| fprintf(stderr, "gennorm: syntax error parsing range at %s\n", s); |
| *pErrorCode=U_PARSE_ERROR; |
| return -1; |
| } |
| } |
| |
| /* parser for DerivedNormalizationProperties.txt ---------------------------- */ |
| |
| static void |
| derivedNormalizationPropertiesLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode) { |
| char *s; |
| uint32_t start, end; |
| int32_t count; |
| uint8_t qcFlags; |
| |
| /* get code point range */ |
| count=parseCodePointRange(fields[0][0], &start, &end, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]); |
| exit(*pErrorCode); |
| } |
| |
| /* ignore hangul - handle explicitly */ |
| if(start==0xac00) { |
| return; |
| } |
| |
| /* get property - ignore unrecognized ones */ |
| s=(char *)skipWhitespace(fields[1][0]); |
| if(*s=='N' && s[1]=='F') { |
| qcFlags=0x11; |
| s+=2; |
| if(*s=='K') { |
| qcFlags<<=1; |
| ++s; |
| } |
| |
| if(*s=='C' && s[1]=='_') { |
| s+=2; |
| } else if(*s=='D' && s[1]=='_') { |
| qcFlags<<=2; |
| s+=2; |
| } else { |
| return; |
| } |
| |
| if(0==uprv_memcmp(s, "NO", 2)) { |
| qcFlags&=0xf; |
| } else if(0==uprv_memcmp(s, "MAYBE", 5)) { |
| qcFlags&=0x30; |
| } else { |
| return; |
| } |
| |
| /* set this flag for all code points in this range */ |
| while(start<=end) { |
| setQCFlags(start++, qcFlags); |
| } |
| } else if(0==uprv_memcmp(s, "Comp_Ex", 7)) { |
| while(start<=end) { |
| setCompositionExclusion(start++); |
| } |
| } |
| } |
| |
| static void |
| parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode) { |
| char *fields[2][2]; |
| |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode); |
| } |
| |
| /* parser for UnicodeData.txt ----------------------------------------------- */ |
| |
| static void |
| unicodeDataLineFn(void *context, |
| char *fields[][2], int32_t fieldCount, |
| UErrorCode *pErrorCode) { |
| uint32_t decomp[40]; |
| Norm norm; |
| const char *s; |
| char *end; |
| uint32_t code, value; |
| int32_t length; |
| UBool isCompat, something=FALSE; |
| |
| /* ignore First and Last entries for ranges */ |
| if( *fields[1][0]=='<' && |
| (length=(fields[1][1]-fields[1][0]))>=9 && |
| (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) |
| ) { |
| return; |
| } |
| |
| /* reset the properties */ |
| uprv_memset(&norm, 0, sizeof(Norm)); |
| |
| /* get the character code, field 0 */ |
| code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); |
| if(end<=fields[0][0] || end!=fields[0][1]) { |
| fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]); |
| *pErrorCode=U_PARSE_ERROR; |
| exit(U_PARSE_ERROR); |
| } |
| |
| /* get canonical combining class, field 3 */ |
| value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10); |
| if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { |
| fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]); |
| *pErrorCode=U_PARSE_ERROR; |
| exit(U_PARSE_ERROR); |
| } |
| if(value>0) { |
| norm.udataCC=(uint8_t)value; |
| something=TRUE; |
| } |
| |
| /* get the decomposition, field 5 */ |
| if(fields[5][0]<fields[5][1]) { |
| if(*(s=fields[5][0])=='<') { |
| ++s; |
| isCompat=TRUE; |
| |
| /* skip and ignore the compatibility type name */ |
| do { |
| if(s==fields[5][1]) { |
| /* missing '>' */ |
| fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]); |
| *pErrorCode=U_PARSE_ERROR; |
| exit(U_PARSE_ERROR); |
| } |
| } while(*s++!='>'); |
| } else { |
| isCompat=FALSE; |
| } |
| |
| /* parse the decomposition string */ |
| length=parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| exit(*pErrorCode); |
| } |
| |
| /* store the string */ |
| if(length>0) { |
| something=TRUE; |
| if(isCompat) { |
| norm.lenNFKD=(uint8_t)length; |
| norm.nfkd=decomp; |
| } else { |
| if(length>2) { |
| fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n", |
| (long)code, (long)length); |
| *pErrorCode=U_PARSE_ERROR; |
| exit(U_PARSE_ERROR); |
| } |
| norm.lenNFD=(uint8_t)length; |
| norm.nfd=decomp; |
| } |
| } |
| } |
| |
| /* check for non-character code points */ |
| if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) { |
| fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n", |
| (long)code); |
| *pErrorCode=U_PARSE_ERROR; |
| exit(U_PARSE_ERROR); |
| } |
| |
| if(something) { |
| /* there are normalization values, so store them */ |
| if(beVerbose) { |
| printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n", |
| (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD); |
| } |
| storeNorm(code, &norm); |
| } |
| } |
| |
| static void |
| parseDB(const char *filename, UErrorCode *pErrorCode) { |
| char *fields[15][2]; |
| |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); |
| } |
| |
| /* |
| * Hey, Emacs, please set the following: |
| * |
| * Local Variables: |
| * indent-tabs-mode: nil |
| * End: |
| * |
| */ |