source/tools/genprops/genprops.c - external/github.com/unicode-org/icu - Git at Google

 /*
 *******************************************************************************
 *
 *   Copyright (C) 1999-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  genprops.c
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 1999dec08
 *   created by: Markus W. Scherer
 *
 *   This program reads several of the Unicode character database text files,
 *   parses them, and extracts most of the properties for each character.
 *   It then writes a binary file containing the properties
 *   that is designed to be used directly for random-access to
 *   the properties of each Unicode character.
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include "unicode/utypes.h"
 #include "unicode/uchar.h"
 #include "unicode/putil.h"
 #include "unicode/uclean.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "unewdata.h"
 #include "uoptions.h"
 #include "uparse.h"
 #include "uprops.h"
 #include "propsvec.h"

 U_CDECL_BEGIN
 #include "genprops.h"
 U_CDECL_END

 /* Number of elements in a true array; not meaningful when given a pointer. */
 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

 /*
  * Global flags. main() overwrites both of them from the --verbose and
  * --copyright command line options before any input file is processed.
  */
 UBool beVerbose=FALSE, haveCopyright=TRUE;

 /* prototypes --------------------------------------------------------------- */

 /* Parses the UnicodeData file named by filename; called from main(). */
 static void
 parseDB(const char *filename, UErrorCode *pErrorCode);

 /* -------------------------------------------------------------------------- */

 /*
  * Indexes into options[] below: each enumerator names the options[] entry
  * at the same position, so the two lists must stay in the same order.
  */
 enum
 {
     HELP_H,
     HELP_QUESTION_MARK,
     VERBOSE,
     COPYRIGHT,
     DESTDIR,
     SOURCEDIR,
     UNICODE_VERSION,
     ICUDATADIR,
     CSOURCE
 };

 /* Keep these values in sync with the above enums */
 /* Command line option table; filled in by u_parseArgs() in main(). */
 static UOption options[]={
     UOPTION_HELP_H,
     UOPTION_HELP_QUESTION_MARK,
     UOPTION_VERBOSE,
     UOPTION_COPYRIGHT,
     UOPTION_DESTDIR,
     UOPTION_SOURCEDIR,
     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), /* UNICODE_VERSION */
     UOPTION_ICUDATADIR,
     UOPTION_DEF("csource", 'C', UOPT_NO_ARG)        /* CSOURCE */
 };

 /*
  * Program entry point.
  *
  * Parses the command line, reads UnicodeData[-suffix].txt via parseDB() and
  * the additional properties files via generateAdditionalProperties(), and,
  * if no error was recorded, writes the output with generateData().
  *
  * The return value is a UErrorCode used as the process exit code:
  * U_ZERO_ERROR on success or after printing the requested usage text,
  * U_ILLEGAL_ARGUMENT_ERROR for a bad command line or for a source path that
  * does not fit into the filename buffer, otherwise the error code recorded
  * while processing.
  */
 extern int
 main(int argc, char* argv[]) {
     char filename[300];
     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
     char *basename=NULL;
     size_t neededLength;
     UErrorCode errorCode=U_ZERO_ERROR;

     U_MAIN_INIT_ARGS(argc, argv);

     /* preset then read command line options */
     options[DESTDIR].value=u_getDataDirectory();
     options[SOURCEDIR].value="";
     options[UNICODE_VERSION].value="";
     options[ICUDATADIR].value=u_getDataDirectory();
     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

     /* error handling, printing usage message */
     if(argc<0) {
         /* a negative result is the negated index of the offending argument */
         fprintf(stderr,
             "error in command line argument \"%s\"\n",
             argv[-argc]);
     }
     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
         /*
          * Broken into chunks because the C89 standard says the minimum
          * required supported string length is 509 bytes.
          */
         fprintf(stderr,
             "Usage: %s [-options] [suffix]\n"
             "\n"
             "read the UnicodeData.txt file and other Unicode properties files and\n"
             "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
             "\n",
             argv[0]);
         fprintf(stderr,
             "Options:\n"
             "\t-h or -? or --help  this usage text\n"
             "\t-v or --verbose     verbose output\n"
             "\t-c or --copyright   include a copyright notice\n"
             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
         fprintf(stderr,
             "\t-d or --destdir     destination directory, followed by the path\n"
             "\t-s or --sourcedir   source directory, followed by the path\n"
             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
             "\t                    followed by path, defaults to %s\n"
             "\tsuffix              suffix that is to be appended with a '-'\n"
             "\t                    to the source file basenames before opening;\n"
             "\t                    'genprops new' will read UnicodeData-new.txt etc.\n",
             u_getDataDirectory());
         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
     }

     /* get the options values */
     beVerbose=options[VERBOSE].doesOccur;
     haveCopyright=options[COPYRIGHT].doesOccur;
     srcDir=options[SOURCEDIR].value;
     destDir=options[DESTDIR].value;

     if(argc>=2) {
         suffix=argv[1];
     } else {
         suffix=NULL;
     }

     if(options[UNICODE_VERSION].doesOccur) {
         setUnicodeVersion(options[UNICODE_VERSION].value);
     }
     /* else use the default dataVersion in store.c */

     if (options[ICUDATADIR].doesOccur) {
         u_setDataDirectory(options[ICUDATADIR].value);
     }

     /*
      * Refuse paths that would overflow filename[]:
      * srcDir + separator + "UnicodeData" + optional "-suffix" + ".txt" + NUL.
      * NOTE(review): generateAdditionalProperties() appends its own basenames
      * to the same buffer; those lengths are not visible here and are not
      * covered by this check.
      */
     neededLength=uprv_strlen(srcDir)+1+uprv_strlen("UnicodeData")+uprv_strlen(".txt")+1;
     if(suffix!=NULL) {
         neededLength+=1+uprv_strlen(suffix);
     }
     if(neededLength>sizeof(filename)) {
         fprintf(stderr,
             "genprops: error - source directory and suffix are too long for a %d-byte filename buffer\n",
             (int)sizeof(filename));
         return U_ILLEGAL_ARGUMENT_ERROR;
     }

     /* prepare the filename beginning with the source dir */
     uprv_strcpy(filename, srcDir);
     basename=filename+uprv_strlen(filename);
     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
         *basename++=U_FILE_SEP_CHAR;
     }

     /* initialize */
     initStore();

     /* process UnicodeData.txt */
     writeUCDFilename(basename, "UnicodeData", suffix);
     parseDB(filename, &errorCode);

     /* process additional properties files */
     *basename=0; /* cut filename back to just the directory prefix */
     generateAdditionalProperties(filename, suffix, &errorCode);

     /* process parsed data */
     if(U_SUCCESS(errorCode)) {
         /* write the properties data file */
         generateData(destDir, options[CSOURCE].doesOccur);
     }

     exitStore();
     u_cleanup();
     return errorCode;
 }

 /*
  * Writes "<filename>[-<suffix>].txt" into basename.
  * The "-<suffix>" part is omitted when suffix is NULL.
  * The caller must provide a buffer that is large enough for the result.
  */
 U_CFUNC void
 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
     char *limit=basename+uprv_strlen(filename);

     uprv_strcpy(basename, filename);
     if(suffix!=NULL) {
         *limit++='-';
         uprv_strcpy(limit, suffix);
         limit+=uprv_strlen(suffix);
     }
     uprv_strcpy(limit, ".txt");
 }

 /*
  * Tests whether s, after leading white space, starts with token and whether
  * token is then followed only by optional white space and either a ';' or
  * the end of the string.
  * Returns TRUE in that case, otherwise FALSE.
  */
 U_CFUNC UBool
 isToken(const char *token, const char *s) {
     const char *rest;

     s=u_skipWhitespace(s);

     /* consume token and s in parallel for as long as they agree */
     while(*token!=0) {
         if(*s!=*token) {
             return FALSE;
         }
         ++token;
         ++s;
     }

     /* all of token matched: check what terminates it */
     rest=u_skipWhitespace(s);
     if(*rest==';' || *rest==0) {
         return TRUE;
     }
     return FALSE;
 }

 /*
  * Looks up the token at the start of s (after leading white space) in
  * tokens[0..countTokens-1] and returns the index of the first match, or -1.
  * NULL entries in tokens[] are skipped.
  * A token matches if it is a prefix of s and is followed, after optional
  * white space, by ';', '#', '\r', '\n' or the end of the string.
  */
 U_CFUNC int32_t
 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
     const char *candidate, *text, *rest;
     int32_t index;

     s=u_skipWhitespace(s);
     for(index=0; index<countTokens; ++index) {
         candidate=tokens[index];
         if(candidate==NULL) {
             continue;
         }

         /* advance over the common prefix of the candidate and the input */
         text=s;
         while(*candidate!=0 && *text==*candidate) {
             ++candidate;
             ++text;
         }
         if(*candidate!=0) {
             /* mismatch before the end of the candidate */
             continue;
         }

         rest=u_skipWhitespace(text);
         switch(*rest) {
         case ';':
         case 0:
         case '#':
         case '\r':
         case '\n':
             return index;
         default:
             break;
         }
     }
     return -1;
 }

 /* parser for UnicodeData.txt ----------------------------------------------- */

 /*
  * general categories
  *
  * Two-letter general category abbreviations as they appear in field 2 of
  * UnicodeData.txt. unicodeDataLineFn() looks a field up here with
  * getTokenIndex() and stores the resulting array index as the
  * generalCategory value, so the order of the entries is significant.
  * NOTE(review): the order presumably mirrors the UCharCategory constants
  * in unicode/uchar.h - confirm before reordering.
  */
 const char *const
 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
     "Cn",
     "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
     "Mc", "Nd", "Nl", "No",
     "Zs", "Zl", "Zp",
     "Cc", "Cf", "Co", "Cs",
     "Pd", "Ps", "Pe", "Pc", "Po",
     "Sm", "Sc", "Sk", "So",
     "Pi", "Pf"
 };

 /*
  * Names of the compatibility decomposition types as they appear between
  * '<' and '>' in field 5 of UnicodeData.txt. unicodeDataLineFn() passes the
  * matching array index to upvec_setValue() as the decomposition type.
  * The first two entries are NULL, which getTokenIndex() skips; a
  * decomposition without a <type> tag is assigned U_DT_CANONICAL directly.
  * NOTE(review): the order presumably mirrors the UDecompositionType
  * constants in unicode/uchar.h - confirm before reordering.
  */
 const char *const
 decompositionTypeNames[U_DT_COUNT]={
     NULL,
     NULL,
     "compat",
     "circle",
     "final",
     "font",
     "fraction",
     "initial",
     "isolated",
     "medial",
     "narrow",
     "noBreak",
     "small",
     "square",
     "sub",
     "super",
     "vertical",
     "wide"
 };

 /*
  * Areas of code points that UnicodeData.txt describes with a pair of
  * "<name, First>" and "<name, Last>" lines. Each entry holds the first and
  * last code point, the properties value computed for the area, and the
  * area name without the ", First>"/", Last>" part.
  * The table has a fixed capacity of 32 entries.
  */
 static struct {
     uint32_t first, last, props;
     char name[80];
 } unicodeAreas[32];

 /*
  * Number of completed areas, and the index of the entry currently being
  * filled. While that entry is unused, its first field is 0xffffffff.
  */
 static int32_t unicodeAreaIndex=0;

 /*
  * Line callback for u_parseDelimitedFile(): handles one UnicodeData.txt entry.
  *
  * fields[n][0] and fields[n][1] are the start and the limit of field n.
  * Used here: field 0 (code point, hexadecimal), field 1 (name, only examined
  * for "<..., First>" and "<..., Last>" area markers), field 2 (general
  * category), field 5 (decomposition) and fields 6..8 (decimal digit value,
  * digit value, numeric value).
  *
  * A regular entry is stored with addProps(). The First/Last lines of an area
  * are recorded in unicodeAreas[] instead, for repeatAreaProps() to apply.
  * Malformed input prints a message to stderr and terminates the program
  * with exit().
  *
  * context and fieldCount are not used.
  */
 static void U_CALLCONV
 unicodeDataLineFn(void *context,
                   char *fields[][2], int32_t fieldCount,
                   UErrorCode *pErrorCode) {
     Props p;
     char *end;
     static uint32_t prevCode=0;
     uint32_t value;
     int32_t i;

     (void)context;
     (void)fieldCount;

     /* reset the properties */
     uprv_memset(&p, 0, sizeof(Props));

     /* get the character code, field 0; it must be a valid code point */
     p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
     if(end<=fields[0][0] || end!=fields[0][1] || p.code>0x10ffff) {
         fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
         *pErrorCode=U_PARSE_ERROR;
         exit(U_PARSE_ERROR);
     }

     /* get general category, field 2 */
     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
     if(i>=0) {
         p.generalCategory=(uint8_t)i;
     } else {
         fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
             fields[2][0], (unsigned long)p.code);
         *pErrorCode=U_PARSE_ERROR;
         exit(U_PARSE_ERROR);
     }

     /* get decomposition type, field 5 */
     if(fields[5][0]<fields[5][1]) {
         /* there is some decomposition */
         if(*fields[5][0]!='<') {
             /* canonical */
             i=U_DT_CANONICAL;
         } else {
             /* get compatibility type */
             end=fields[5][0]+1;
             while(end<fields[5][1] && *end!='>') {
                 ++end;
             }
             /* overwrite the '>' (or the field limit) with a terminator that getTokenIndex() accepts */
             *end='#';
             i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
             if(i<0) {
                 fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
                     fields[5][0], (unsigned long)p.code);
                 *pErrorCode=U_PARSE_ERROR;
                 exit(U_PARSE_ERROR);
             }
         }
         upvec_setValue(pv, p.code, p.code, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode);
         if(U_FAILURE(*pErrorCode)) {
             fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
             exit(*pErrorCode);
         }
     }

     /* decimal digit value, field 6 */
     if(fields[6][0]<fields[6][1]) {
         value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
         if(end!=fields[6][1] || value>0x7fff) {
             fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
                 (unsigned long)p.code);
             *pErrorCode=U_PARSE_ERROR;
             exit(U_PARSE_ERROR);
         }
         p.numericValue=(int32_t)value;
         p.numericType=1;
     }

     /* digit value, field 7 */
     if(fields[7][0]<fields[7][1]) {
         value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
         if(end!=fields[7][1] || value>0x7fff) {
             fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
                 (unsigned long)p.code);
             *pErrorCode=U_PARSE_ERROR;
             exit(U_PARSE_ERROR);
         }
         if(p.numericType==0) {
             p.numericValue=(int32_t)value;
             p.numericType=2;
         } else if((int32_t)value!=p.numericValue) {
             fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
                 (unsigned long)p.code);
             *pErrorCode=U_PARSE_ERROR;
             exit(U_PARSE_ERROR);
         }
     }

     /* numeric value, field 8 */
     if(fields[8][0]<fields[8][1]) {
         char *s=fields[8][0];
         UBool isNegative;

         /* get a possible minus sign */
         if(*s=='-') {
             isNegative=TRUE;
             ++s;
         } else {
             isNegative=FALSE;
         }

         value=(uint32_t)uprv_strtoul(s, &end, 10);
         if(value>0 && *end=='/') {
             /* field 8 may contain a fractional value, get the denominator */
             if(p.numericType>0) {
                 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
                     (unsigned long)p.code);
                 *pErrorCode=U_PARSE_ERROR;
                 exit(U_PARSE_ERROR);
             }

             p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
             if(p.denominator==0) {
                 fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
                     (unsigned long)p.code);
                 *pErrorCode=U_PARSE_ERROR;
                 exit(U_PARSE_ERROR);
             }
         }
         if(end!=fields[8][1] || value>0x7fffffff) {
             fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
                 (unsigned long)p.code);
             *pErrorCode=U_PARSE_ERROR;
             exit(U_PARSE_ERROR);
         }

         if(p.numericType==0) {
             if(isNegative) {
                 p.numericValue=-(int32_t)value;
             } else {
                 p.numericValue=(int32_t)value;
             }
             p.numericType=3;
         } else if(isNegative || (int32_t)value!=p.numericValue) {
             /*
              * Fields 6 and 7 are never negative, so a negative field 8
              * cannot agree with them even if the magnitudes are equal.
              */
             fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
                 (unsigned long)p.code);
             *pErrorCode=U_PARSE_ERROR;
             exit(U_PARSE_ERROR);
         }
     }

     value=makeProps(&p);

     if(*fields[1][0]=='<') {
         /* first or last entry of a Unicode area */
         size_t length=fields[1][1]-fields[1][0];

         if(length<9) {
             /* name too short for an area name */
         } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
             /* set the current area */
             if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
                 /* strip the leading '<' and the trailing ", First>" */
                 length-=9;
                 if(length>=sizeof(unicodeAreas[unicodeAreaIndex].name)) {
                     /* the name plus its terminating NUL would not fit */
                     fprintf(stderr, "genprops: error - area name at U+%04lx is too long\n",
                         (unsigned long)p.code);
                     *pErrorCode=U_PARSE_ERROR;
                     exit(U_PARSE_ERROR);
                 }
                 unicodeAreas[unicodeAreaIndex].first=p.code;
                 unicodeAreas[unicodeAreaIndex].props=value;
                 uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
                 unicodeAreas[unicodeAreaIndex].name[length]=0;
             } else {
                 /* error: a previous area is incomplete */
                 fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
                 *pErrorCode=U_PARSE_ERROR;
                 exit(U_PARSE_ERROR);
             }
             return;
         } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
             /* check that the current area matches, and complete it with the last code point */
             /* strip the leading '<' and the trailing ", Last>" */
             length-=8;
             if( unicodeAreas[unicodeAreaIndex].props==value &&
                 length<sizeof(unicodeAreas[unicodeAreaIndex].name) &&
                 0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
                 unicodeAreas[unicodeAreaIndex].name[length]==0 &&
                 unicodeAreas[unicodeAreaIndex].first<p.code
             ) {
                 unicodeAreas[unicodeAreaIndex].last=p.code;
                 if(beVerbose) {
                     printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
                         (unsigned long)unicodeAreas[unicodeAreaIndex].first,
                         (unsigned long)unicodeAreas[unicodeAreaIndex].last,
                         unicodeAreas[unicodeAreaIndex].name);
                 }
                 /* the entry after the last completed area holds the "unused" marker and must exist */
                 if((size_t)unicodeAreaIndex+1>=sizeof(unicodeAreas)/sizeof(unicodeAreas[0])) {
                     fprintf(stderr, "genprops: error - too many Unicode areas, increase the capacity of unicodeAreas[]\n");
                     *pErrorCode=U_PARSE_ERROR;
                     exit(U_PARSE_ERROR);
                 }
                 unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
             } else {
                 /* error: different properties between first & last, different area name, first>=last */
                 fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
                 *pErrorCode=U_PARSE_ERROR;
                 exit(U_PARSE_ERROR);
             }
             return;
         } else {
             /* not an area name */
         }
     }

     /* check for non-character code points */
     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
         fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
                 (unsigned long)p.code);
         *pErrorCode=U_PARSE_ERROR;
         exit(U_PARSE_ERROR);
     }

     /* check that the code points (p.code) are in ascending order */
     if(p.code<=prevCode && p.code>0) {
         fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                 (unsigned long)p.code, (unsigned long)prevCode);
         *pErrorCode=U_PARSE_ERROR;
         exit(U_PARSE_ERROR);
     }
     prevCode=p.code;

     /* properties for a single code point */
     addProps(p.code, value);
 }

 /* set repeated properties for the areas */
 static void
 repeatAreaProps() {
     uint32_t puaProps;
     int32_t i;
     UBool hasPlane15PUA, hasPlane16PUA;
     UErrorCode errorCode;

     /*
      * UnicodeData.txt before 3.0.1 did not contain the PUAs on
      * planes 15 and 16.
      * If that is the case, then we add them here, using the properties
      * from the BMP PUA.
      */
     puaProps=0;
     hasPlane15PUA=hasPlane16PUA=FALSE;

     for(i=0; i<unicodeAreaIndex; ++i) {
         repeatProps(unicodeAreas[i].first,
                     unicodeAreas[i].last,
                     unicodeAreas[i].props);
         if(unicodeAreas[i].first==0xe000) {
             puaProps=unicodeAreas[i].props;
         } else if(unicodeAreas[i].first==0xf0000) {
             hasPlane15PUA=TRUE;
         } else if(unicodeAreas[i].first==0x100000) {
             hasPlane16PUA=TRUE;
         }
     }

     if(puaProps!=0) {
         if(!hasPlane15PUA) {
             repeatProps(0xf0000, 0xffffd, puaProps);
         }
         if(!hasPlane16PUA) {
             repeatProps(0x100000, 0x10fffd, puaProps);
         }
     }

     /* Hangul have canonical decompositions */
     errorCode=U_ZERO_ERROR;
     upvec_setValue(pv, 0xac00, 0xd7a3, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode);
     if(U_FAILURE(errorCode)) {
         fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
         exit(errorCode);
     }
 }

 /*
  * Parses the UnicodeData file named by filename, one line at a time via
  * unicodeDataLineFn(), and then applies the collected area properties with
  * repeatAreaProps().
  *
  * Does nothing if pErrorCode is NULL or already indicates a failure.
  * If parsing fails, a message is printed, *pErrorCode keeps the failure code
  * and the area post-processing is skipped.
  * An area that is still incomplete after a successful parse terminates the
  * program with exit().
  */
 static void
 parseDB(const char *filename, UErrorCode *pErrorCode) {
     char *fields[15][2];

     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
         return;
     }

     /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
     unicodeAreas[0].first=0xffffffff;

     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
     if(U_FAILURE(*pErrorCode)) {
         /* report the real failure instead of post-processing partial data */
         fprintf(stderr, "genprops: error - unable to parse \"%s\" (error code %d)\n",
             filename, (int)*pErrorCode);
         return;
     }

     if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
         fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
             unicodeAreas[unicodeAreaIndex].name,
             (unsigned long)unicodeAreas[unicodeAreaIndex].first);
         *pErrorCode=U_PARSE_ERROR;
         exit(U_PARSE_ERROR);
     }

     repeatAreaProps();
 }

 /*
  * Hey, Emacs, please set the following:
  *
  * Local Variables:
  * indent-tabs-mode: nil
  * End:
  *
  */
	/*
	*******************************************************************************
	*
	* Copyright (C) 1999-2008, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	*******************************************************************************
	* file name: genprops.c
	* encoding: US-ASCII
	* tab size: 8 (not used)
	* indentation:4
	*
	* created on: 1999dec08
	* created by: Markus W. Scherer
	*
	* This program reads several of the Unicode character database text files,
	* parses them, and extracts most of the properties for each character.
	* It then writes a binary file containing the properties
	* that is designed to be used directly for random-access to
	* the properties of each Unicode character.
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include "unicode/utypes.h"
	#include "unicode/uchar.h"
	#include "unicode/putil.h"
	#include "unicode/uclean.h"
	#include "cmemory.h"
	#include "cstring.h"
	#include "unewdata.h"
	#include "uoptions.h"
	#include "uparse.h"
	#include "uprops.h"
	#include "propsvec.h"

	U_CDECL_BEGIN
	#include "genprops.h"
	U_CDECL_END

	/* Number of elements in a true array; not meaningful when given a pointer. */
	#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

	/*
	 * Global flags. main() overwrites both of them from the --verbose and
	 * --copyright command line options before any input file is processed.
	 */
	UBool beVerbose=FALSE, haveCopyright=TRUE;

	/* prototypes --------------------------------------------------------------- */

	/* Parses the UnicodeData file named by filename; called from main(). */
	static void
	parseDB(const char *filename, UErrorCode *pErrorCode);

	/* -------------------------------------------------------------------------- */

	/*
	 * Indexes into options[] below: each enumerator names the options[] entry
	 * at the same position, so the two lists must stay in the same order.
	 */
	enum
	{
	HELP_H,
	HELP_QUESTION_MARK,
	VERBOSE,
	COPYRIGHT,
	DESTDIR,
	SOURCEDIR,
	UNICODE_VERSION,
	ICUDATADIR,
	CSOURCE
	};

	/* Keep these values in sync with the above enums */
	/* Command line option table; filled in by u_parseArgs() in main(). */
	static UOption options[]={
	UOPTION_HELP_H,
	UOPTION_HELP_QUESTION_MARK,
	UOPTION_VERBOSE,
	UOPTION_COPYRIGHT,
	UOPTION_DESTDIR,
	UOPTION_SOURCEDIR,
	UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
	UOPTION_ICUDATADIR,
	UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
	};

	extern int
	main(int argc, char* argv[]) {
	char filename[300];
	const char srcDir=NULL, destDir=NULL, *suffix=NULL;
	char *basename=NULL;
	UErrorCode errorCode=U_ZERO_ERROR;

	U_MAIN_INIT_ARGS(argc, argv);

	/* preset then read command line options */
	options[DESTDIR].value=u_getDataDirectory();
	options[SOURCEDIR].value="";
	options[UNICODE_VERSION].value="";
	options[ICUDATADIR].value=u_getDataDirectory();
	argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

	/* error handling, printing usage message */
	if(argc<0) {
	fprintf(stderr,
	"error in command line argument \"%s\"\n",
	argv[-argc]);
	}
	if(argc<0 \|\| options[HELP_H].doesOccur \|\| options[HELP_QUESTION_MARK].doesOccur) {
	/*
	* Broken into chucks because the C89 standard says the minimum
	* required supported string length is 509 bytes.
	*/
	fprintf(stderr,
	"Usage: %s [-options] [suffix]\n"
	"\n"
	"read the UnicodeData.txt file and other Unicode properties files and\n"
	"create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
	"\n",
	argv[0]);
	fprintf(stderr,
	"Options:\n"
	"\t-h or -? or --help this usage text\n"
	"\t-v or --verbose verbose output\n"
	"\t-c or --copyright include a copyright notice\n"
	"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
	"\t-C or --csource generate a .c source file rather than the .icu binary\n");
	fprintf(stderr,
	"\t-d or --destdir destination directory, followed by the path\n"
	"\t-s or --sourcedir source directory, followed by the path\n"
	"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
	"\t followed by path, defaults to %s\n"
	"\tsuffix suffix that is to be appended with a '-'\n"
	"\t to the source file basenames before opening;\n"
	"\t 'genprops new' will read UnicodeData-new.txt etc.\n",
	u_getDataDirectory());
	return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	}

	/* get the options values */
	beVerbose=options[VERBOSE].doesOccur;
	haveCopyright=options[COPYRIGHT].doesOccur;
	srcDir=options[SOURCEDIR].value;
	destDir=options[DESTDIR].value;

	if(argc>=2) {
	suffix=argv[1];
	} else {
	suffix=NULL;
	}

	if(options[UNICODE_VERSION].doesOccur) {
	setUnicodeVersion(options[UNICODE_VERSION].value);
	}
	/* else use the default dataVersion in store.c */

	if (options[ICUDATADIR].doesOccur) {
	u_setDataDirectory(options[ICUDATADIR].value);
	}

	/* prepare the filename beginning with the source dir */
	uprv_strcpy(filename, srcDir);
	basename=filename+uprv_strlen(filename);
	if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
	*basename++=U_FILE_SEP_CHAR;
	}

	/* initialize */
	initStore();

	/* process UnicodeData.txt */
	writeUCDFilename(basename, "UnicodeData", suffix);
	parseDB(filename, &errorCode);

	/* process additional properties files */
	*basename=0;
	generateAdditionalProperties(filename, suffix, &errorCode);

	/* process parsed data */
	if(U_SUCCESS(errorCode)) {
	/* write the properties data file */
	generateData(destDir, options[CSOURCE].doesOccur);
	}

	exitStore();
	u_cleanup();
	return errorCode;
	}

	/*
	 * Writes "<filename>[-<suffix>].txt" into basename.
	 * The "-<suffix>" part is omitted when suffix is NULL.
	 * The caller must provide a buffer that is large enough for the result.
	 */
	U_CFUNC void
	writeUCDFilename(char *basename, const char *filename, const char *suffix) {
	    int32_t length=(int32_t)uprv_strlen(filename);

	    uprv_strcpy(basename, filename);
	    if(suffix!=NULL) {
	        basename[length++]='-';
	        uprv_strcpy(basename+length, suffix);
	        length+=(int32_t)uprv_strlen(suffix);
	    }
	    uprv_strcpy(basename+length, ".txt");
	}

	/*
	 * Tests whether s, after leading white space, starts with token and whether
	 * token is then followed only by optional white space and either a ';' or
	 * the end of the string.
	 * Returns TRUE in that case, otherwise FALSE.
	 */
	U_CFUNC UBool
	isToken(const char *token, const char *s) {
	    const char *z;
	    int32_t j;

	    s=u_skipWhitespace(s);
	    for(j=0;; ++j) {
	        if(token[j]!=0) {
	            if(s[j]!=token[j]) {
	                break;
	            }
	        } else {
	            /* all of token matched: check what terminates it */
	            z=u_skipWhitespace(s+j);
	            if(*z==';' || *z==0) {
	                return TRUE;
	            } else {
	                break;
	            }
	        }
	    }

	    return FALSE;
	}

	/*
	 * Looks up the token at the start of s (after leading white space) in
	 * tokens[0..countTokens-1] and returns the index of the first match, or -1.
	 * NULL entries in tokens[] are skipped.
	 * A token matches if it is a prefix of s and is followed, after optional
	 * white space, by ';', '#', '\r', '\n' or the end of the string.
	 */
	U_CFUNC int32_t
	getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
	    const char *t, *z;
	    int32_t i, j;

	    s=u_skipWhitespace(s);
	    for(i=0; i<countTokens; ++i) {
	        t=tokens[i];
	        if(t!=NULL) {
	            for(j=0;; ++j) {
	                if(t[j]!=0) {
	                    if(s[j]!=t[j]) {
	                        break;
	                    }
	                } else {
	                    /* all of the candidate matched: check what terminates it */
	                    z=u_skipWhitespace(s+j);
	                    if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
	                        return i;
	                    } else {
	                        break;
	                    }
	                }
	            }
	        }
	    }
	    return -1;
	}

	/* parser for UnicodeData.txt ----------------------------------------------- */

	/*
	 * general categories
	 *
	 * Two-letter general category abbreviations as they appear in field 2 of
	 * UnicodeData.txt. unicodeDataLineFn() looks a field up here with
	 * getTokenIndex() and stores the resulting array index as the
	 * generalCategory value, so the order of the entries is significant.
	 * NOTE(review): the order presumably mirrors the UCharCategory constants
	 * in unicode/uchar.h - confirm before reordering.
	 */
	const char *const
	genCategoryNames[U_CHAR_CATEGORY_COUNT]={
	"Cn",
	"Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
	"Mc", "Nd", "Nl", "No",
	"Zs", "Zl", "Zp",
	"Cc", "Cf", "Co", "Cs",
	"Pd", "Ps", "Pe", "Pc", "Po",
	"Sm", "Sc", "Sk", "So",
	"Pi", "Pf"
	};

	/*
	 * Names of the compatibility decomposition types as they appear between
	 * '<' and '>' in field 5 of UnicodeData.txt. unicodeDataLineFn() passes the
	 * matching array index to upvec_setValue() as the decomposition type.
	 * The first two entries are NULL, which getTokenIndex() skips; a
	 * decomposition without a <type> tag is assigned U_DT_CANONICAL directly.
	 * NOTE(review): the order presumably mirrors the UDecompositionType
	 * constants in unicode/uchar.h - confirm before reordering.
	 */
	const char *const
	decompositionTypeNames[U_DT_COUNT]={
	NULL,
	NULL,
	"compat",
	"circle",
	"final",
	"font",
	"fraction",
	"initial",
	"isolated",
	"medial",
	"narrow",
	"noBreak",
	"small",
	"square",
	"sub",
	"super",
	"vertical",
	"wide"
	};

	/*
	 * Areas of code points that UnicodeData.txt describes with a pair of
	 * "<name, First>" and "<name, Last>" lines. Each entry holds the first and
	 * last code point, the properties value computed for the area, and the
	 * area name without the ", First>"/", Last>" part.
	 * The table has a fixed capacity of 32 entries.
	 */
	static struct {
	uint32_t first, last, props;
	char name[80];
	} unicodeAreas[32];

	/*
	 * Number of completed areas, and the index of the entry currently being
	 * filled. While that entry is unused, its first field is 0xffffffff.
	 */
	static int32_t unicodeAreaIndex=0;

	/*
	 * Line callback for u_parseDelimitedFile(): handles one UnicodeData.txt entry.
	 *
	 * fields[n][0] and fields[n][1] are the start and the limit of field n.
	 * Used here: field 0 (code point, hexadecimal), field 1 (name, only examined
	 * for "<..., First>" and "<..., Last>" area markers), field 2 (general
	 * category), field 5 (decomposition) and fields 6..8 (decimal digit value,
	 * digit value, numeric value).
	 *
	 * A regular entry is stored with addProps(). The First/Last lines of an area
	 * are recorded in unicodeAreas[] instead, for repeatAreaProps() to apply.
	 * Malformed input prints a message to stderr and terminates the program
	 * with exit().
	 *
	 * context and fieldCount are not used.
	 */
	static void U_CALLCONV
	unicodeDataLineFn(void *context,
	                  char *fields[][2], int32_t fieldCount,
	                  UErrorCode *pErrorCode) {
	    Props p;
	    char *end;
	    static uint32_t prevCode=0;
	    uint32_t value;
	    int32_t i;

	    (void)context;
	    (void)fieldCount;

	    /* reset the properties */
	    uprv_memset(&p, 0, sizeof(Props));

	    /* get the character code, field 0; it must be a valid code point */
	    p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
	    if(end<=fields[0][0] || end!=fields[0][1] || p.code>0x10ffff) {
	        fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
	        *pErrorCode=U_PARSE_ERROR;
	        exit(U_PARSE_ERROR);
	    }

	    /* get general category, field 2 */
	    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
	    if(i>=0) {
	        p.generalCategory=(uint8_t)i;
	    } else {
	        fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
	            fields[2][0], (unsigned long)p.code);
	        *pErrorCode=U_PARSE_ERROR;
	        exit(U_PARSE_ERROR);
	    }

	    /* get decomposition type, field 5 */
	    if(fields[5][0]<fields[5][1]) {
	        /* there is some decomposition */
	        if(*fields[5][0]!='<') {
	            /* canonical */
	            i=U_DT_CANONICAL;
	        } else {
	            /* get compatibility type */
	            end=fields[5][0]+1;
	            while(end<fields[5][1] && *end!='>') {
	                ++end;
	            }
	            /* overwrite the '>' (or the field limit) with a terminator that getTokenIndex() accepts */
	            *end='#';
	            i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
	            if(i<0) {
	                fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
	                    fields[5][0], (unsigned long)p.code);
	                *pErrorCode=U_PARSE_ERROR;
	                exit(U_PARSE_ERROR);
	            }
	        }
	        upvec_setValue(pv, p.code, p.code, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode);
	        if(U_FAILURE(*pErrorCode)) {
	            fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
	            exit(*pErrorCode);
	        }
	    }

	    /* decimal digit value, field 6 */
	    if(fields[6][0]<fields[6][1]) {
	        value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
	        if(end!=fields[6][1] || value>0x7fff) {
	            fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
	                (unsigned long)p.code);
	            *pErrorCode=U_PARSE_ERROR;
	            exit(U_PARSE_ERROR);
	        }
	        p.numericValue=(int32_t)value;
	        p.numericType=1;
	    }

	    /* digit value, field 7 */
	    if(fields[7][0]<fields[7][1]) {
	        value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
	        if(end!=fields[7][1] || value>0x7fff) {
	            fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
	                (unsigned long)p.code);
	            *pErrorCode=U_PARSE_ERROR;
	            exit(U_PARSE_ERROR);
	        }
	        if(p.numericType==0) {
	            p.numericValue=(int32_t)value;
	            p.numericType=2;
	        } else if((int32_t)value!=p.numericValue) {
	            fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
	                (unsigned long)p.code);
	            *pErrorCode=U_PARSE_ERROR;
	            exit(U_PARSE_ERROR);
	        }
	    }

	    /* numeric value, field 8 */
	    if(fields[8][0]<fields[8][1]) {
	        char *s=fields[8][0];
	        UBool isNegative;

	        /* get a possible minus sign */
	        if(*s=='-') {
	            isNegative=TRUE;
	            ++s;
	        } else {
	            isNegative=FALSE;
	        }

	        value=(uint32_t)uprv_strtoul(s, &end, 10);
	        if(value>0 && *end=='/') {
	            /* field 8 may contain a fractional value, get the denominator */
	            if(p.numericType>0) {
	                fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
	                    (unsigned long)p.code);
	                *pErrorCode=U_PARSE_ERROR;
	                exit(U_PARSE_ERROR);
	            }

	            p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
	            if(p.denominator==0) {
	                fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
	                    (unsigned long)p.code);
	                *pErrorCode=U_PARSE_ERROR;
	                exit(U_PARSE_ERROR);
	            }
	        }
	        if(end!=fields[8][1] || value>0x7fffffff) {
	            fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
	                (unsigned long)p.code);
	            *pErrorCode=U_PARSE_ERROR;
	            exit(U_PARSE_ERROR);
	        }

	        if(p.numericType==0) {
	            if(isNegative) {
	                p.numericValue=-(int32_t)value;
	            } else {
	                p.numericValue=(int32_t)value;
	            }
	            p.numericType=3;
	        } else if(isNegative || (int32_t)value!=p.numericValue) {
	            /*
	             * Fields 6 and 7 are never negative, so a negative field 8
	             * cannot agree with them even if the magnitudes are equal.
	             */
	            fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
	                (unsigned long)p.code);
	            *pErrorCode=U_PARSE_ERROR;
	            exit(U_PARSE_ERROR);
	        }
	    }

	    value=makeProps(&p);

	    if(*fields[1][0]=='<') {
	        /* first or last entry of a Unicode area */
	        size_t length=fields[1][1]-fields[1][0];

	        if(length<9) {
	            /* name too short for an area name */
	        } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
	            /* set the current area */
	            if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
	                /* strip the leading '<' and the trailing ", First>" */
	                length-=9;
	                if(length>=sizeof(unicodeAreas[unicodeAreaIndex].name)) {
	                    /* the name plus its terminating NUL would not fit */
	                    fprintf(stderr, "genprops: error - area name at U+%04lx is too long\n",
	                        (unsigned long)p.code);
	                    *pErrorCode=U_PARSE_ERROR;
	                    exit(U_PARSE_ERROR);
	                }
	                unicodeAreas[unicodeAreaIndex].first=p.code;
	                unicodeAreas[unicodeAreaIndex].props=value;
	                uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
	                unicodeAreas[unicodeAreaIndex].name[length]=0;
	            } else {
	                /* error: a previous area is incomplete */
	                fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
	                *pErrorCode=U_PARSE_ERROR;
	                exit(U_PARSE_ERROR);
	            }
	            return;
	        } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
	            /* check that the current area matches, and complete it with the last code point */
	            /* strip the leading '<' and the trailing ", Last>" */
	            length-=8;
	            if( unicodeAreas[unicodeAreaIndex].props==value &&
	                length<sizeof(unicodeAreas[unicodeAreaIndex].name) &&
	                0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
	                unicodeAreas[unicodeAreaIndex].name[length]==0 &&
	                unicodeAreas[unicodeAreaIndex].first<p.code
	            ) {
	                unicodeAreas[unicodeAreaIndex].last=p.code;
	                if(beVerbose) {
	                    printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
	                        (unsigned long)unicodeAreas[unicodeAreaIndex].first,
	                        (unsigned long)unicodeAreas[unicodeAreaIndex].last,
	                        unicodeAreas[unicodeAreaIndex].name);
	                }
	                /* the entry after the last completed area holds the "unused" marker and must exist */
	                if((size_t)unicodeAreaIndex+1>=sizeof(unicodeAreas)/sizeof(unicodeAreas[0])) {
	                    fprintf(stderr, "genprops: error - too many Unicode areas, increase the capacity of unicodeAreas[]\n");
	                    *pErrorCode=U_PARSE_ERROR;
	                    exit(U_PARSE_ERROR);
	                }
	                unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
	            } else {
	                /* error: different properties between first & last, different area name, first>=last */
	                fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
	                *pErrorCode=U_PARSE_ERROR;
	                exit(U_PARSE_ERROR);
	            }
	            return;
	        } else {
	            /* not an area name */
	        }
	    }

	    /* check for non-character code points */
	    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
	        fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
	                (unsigned long)p.code);
	        *pErrorCode=U_PARSE_ERROR;
	        exit(U_PARSE_ERROR);
	    }

	    /* check that the code points (p.code) are in ascending order */
	    if(p.code<=prevCode && p.code>0) {
	        fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
	                (unsigned long)p.code, (unsigned long)prevCode);
	        *pErrorCode=U_PARSE_ERROR;
	        exit(U_PARSE_ERROR);
	    }
	    prevCode=p.code;

	    /* properties for a single code point */
	    addProps(p.code, value);
	}

	/* set repeated properties for the areas */
	static void
	repeatAreaProps() {
	uint32_t puaProps;
	int32_t i;
	UBool hasPlane15PUA, hasPlane16PUA;
	UErrorCode errorCode;

	/*
	* UnicodeData.txt before 3.0.1 did not contain the PUAs on
	* planes 15 and 16.
	* If that is the case, then we add them here, using the properties
	* from the BMP PUA.
	*/
	puaProps=0;
	hasPlane15PUA=hasPlane16PUA=FALSE;

	for(i=0; i<unicodeAreaIndex; ++i) {
	repeatProps(unicodeAreas[i].first,
	unicodeAreas[i].last,
	unicodeAreas[i].props);
	if(unicodeAreas[i].first==0xe000) {
	puaProps=unicodeAreas[i].props;
	} else if(unicodeAreas[i].first==0xf0000) {
	hasPlane15PUA=TRUE;
	} else if(unicodeAreas[i].first==0x100000) {
	hasPlane16PUA=TRUE;
	}
	}

	if(puaProps!=0) {
	if(!hasPlane15PUA) {
	repeatProps(0xf0000, 0xffffd, puaProps);
	}
	if(!hasPlane16PUA) {
	repeatProps(0x100000, 0x10fffd, puaProps);
	}
	}

	/* Hangul have canonical decompositions */
	errorCode=U_ZERO_ERROR;
	upvec_setValue(pv, 0xac00, 0xd7a3, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode);
	if(U_FAILURE(errorCode)) {
	fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
	exit(errorCode);
	}
	}

	/*
	 * Parses the UnicodeData file named by filename, one line at a time via
	 * unicodeDataLineFn(), and then applies the collected area properties with
	 * repeatAreaProps().
	 *
	 * Does nothing if pErrorCode is NULL or already indicates a failure.
	 * If parsing fails, a message is printed, *pErrorCode keeps the failure code
	 * and the area post-processing is skipped.
	 * An area that is still incomplete after a successful parse terminates the
	 * program with exit().
	 */
	static void
	parseDB(const char *filename, UErrorCode *pErrorCode) {
	    char *fields[15][2];

	    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
	        return;
	    }

	    /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
	    unicodeAreas[0].first=0xffffffff;

	    u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
	    if(U_FAILURE(*pErrorCode)) {
	        /* report the real failure instead of post-processing partial data */
	        fprintf(stderr, "genprops: error - unable to parse \"%s\" (error code %d)\n",
	            filename, (int)*pErrorCode);
	        return;
	    }

	    if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
	        fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
	            unicodeAreas[unicodeAreaIndex].name,
	            (unsigned long)unicodeAreas[unicodeAreaIndex].first);
	        *pErrorCode=U_PARSE_ERROR;
	        exit(U_PARSE_ERROR);
	    }

	    repeatAreaProps();
	}

	/*
	* Hey, Emacs, please set the following:
	*
	* Local Variables:
	* indent-tabs-mode: nil
	* End:
	*
	*/