/* blob: 119ed006d06e7c5d01e3a52b70e3633c7bd56c70 [file] [log] [blame] */
/*
*******************************************************************************
*
* Copyright (C) 1999-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: genprops.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999dec08
* created by: Markus W. Scherer
*
* This program reads several of the Unicode character database text files,
* parses them, and extracts most of the properties for each character.
* It then writes a binary file containing the properties
* that is designed to be used directly for random-access to
* the properties of each Unicode character.
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/putil.h"
#include "unicode/uclean.h"
#include "cmemory.h"
#include "cstring.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "uprops.h"
#include "propsvec.h"
U_CDECL_BEGIN
#include "genprops.h"
U_CDECL_END
/* Number of elements in a true array; not valid for a pointer or an array parameter. */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
/*
 * Global tool settings. main() overwrites both from the command line:
 * beVerbose from the VERBOSE option and haveCopyright from the COPYRIGHT option.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;
/* prototypes --------------------------------------------------------------- */
/*
 * Parses the UnicodeData file named by filename; defined at the end of this file.
 * Does nothing if pErrorCode is NULL or already holds a failure code.
 */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
/*
 * Indexes into the options[] array below.
 * The order of these enumerators must match the order of the
 * initializers in options[].
 */
enum
{
HELP_H,
HELP_QUESTION_MARK,
VERBOSE,
COPYRIGHT,
DESTDIR,
SOURCEDIR,
UNICODE_VERSION,
ICUDATADIR,
CSOURCE
};
/*
 * Command-line option table passed to u_parseArgs() in main().
 * Keep the entries in the same order as the enum constants above:
 * main() indexes this array with those constants.
 */
static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
UOPTION_COPYRIGHT,
UOPTION_DESTDIR,
UOPTION_SOURCEDIR,
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
UOPTION_ICUDATADIR,
UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
};
/*
 * Program entry point.
 *
 * Parses the command line, reads UnicodeData.txt and the additional
 * properties files from the source directory, and, if everything succeeded,
 * writes the properties data to the destination directory.
 *
 * Returns a UErrorCode value as the process exit status:
 * U_ZERO_ERROR on success or after printing the help text,
 * U_ILLEGAL_ARGUMENT_ERROR for a bad command line or an over-long source
 * path, otherwise the error code set while parsing.
 */
extern int
main(int argc, char* argv[]) {
    /*
     * Room reserved in filename[] for the longest base name that is appended
     * after the source directory.
     * NOTE(review): 32 is an assumed upper bound for the names appended by
     * generateAdditionalProperties(), which is defined elsewhere - confirm.
     */
    enum { MAX_UCD_BASENAME_LENGTH=32 };

    char filename[300];
    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
    char *basename=NULL;
    size_t neededLength;
    UErrorCode errorCode=U_ZERO_ERROR;

    U_MAIN_INIT_ARGS(argc, argv);

    /* preset then read command line options */
    options[DESTDIR].value=u_getDataDirectory();
    options[SOURCEDIR].value="";
    options[UNICODE_VERSION].value="";
    options[ICUDATADIR].value=u_getDataDirectory();
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

    /* error handling, printing usage message */
    if(argc<0) {
        fprintf(stderr,
            "error in command line argument \"%s\"\n",
            argv[-argc]);
    }
    if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
        /*
         * Broken into chunks because the C89 standard says the minimum
         * required supported string length is 509 bytes.
         */
        fprintf(stderr,
            "Usage: %s [-options] [suffix]\n"
            "\n"
            "read the UnicodeData.txt file and other Unicode properties files and\n"
            "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
            "\n",
            argv[0]);
        fprintf(stderr,
            "Options:\n"
            "\t-h or -? or --help this usage text\n"
            "\t-v or --verbose verbose output\n"
            "\t-c or --copyright include a copyright notice\n"
            "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
            "\t-C or --csource generate a .c source file rather than the .icu binary\n");
        fprintf(stderr,
            "\t-d or --destdir destination directory, followed by the path\n"
            "\t-s or --sourcedir source directory, followed by the path\n"
            "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
            "\t followed by path, defaults to %s\n"
            "\tsuffix suffix that is to be appended with a '-'\n"
            "\t to the source file basenames before opening;\n"
            "\t 'genprops new' will read UnicodeData-new.txt etc.\n",
            u_getDataDirectory());
        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    }

    /* get the options values */
    beVerbose=options[VERBOSE].doesOccur;
    haveCopyright=options[COPYRIGHT].doesOccur;
    srcDir=options[SOURCEDIR].value;
    destDir=options[DESTDIR].value;

    /* the first remaining non-option argument, if any, is the file name suffix */
    if(argc>=2) {
        suffix=argv[1];
    } else {
        suffix=NULL;
    }

    if(options[UNICODE_VERSION].doesOccur) {
        setUnicodeVersion(options[UNICODE_VERSION].value);
    }
    /* else use the default dataVersion in store.c */

    if (options[ICUDATADIR].doesOccur) {
        u_setDataDirectory(options[ICUDATADIR].value);
    }

    /*
     * filename[] receives: source dir + separator + base name
     * [+ '-' + suffix] + ".txt" + NUL.
     * Reject paths that cannot fit instead of overflowing the stack buffer.
     */
    neededLength=uprv_strlen(srcDir)+1+MAX_UCD_BASENAME_LENGTH+4+1;
    if(suffix!=NULL) {
        neededLength+=1+uprv_strlen(suffix);
    }
    if(neededLength>sizeof(filename)) {
        fprintf(stderr,
            "genprops: error - source directory \"%s\" and suffix are too long\n",
            srcDir);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }

    /* prepare the filename beginning with the source dir */
    uprv_strcpy(filename, srcDir);
    basename=filename+uprv_strlen(filename);
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        *basename++=U_FILE_SEP_CHAR;
    }

    /* initialize */
    initStore();

    /* process UnicodeData.txt */
    writeUCDFilename(basename, "UnicodeData", suffix);
    parseDB(filename, &errorCode);

    /* process additional properties files; cut filename back to the directory part first */
    *basename=0;
    generateAdditionalProperties(filename, suffix, &errorCode);

    /* process parsed data */
    if(U_SUCCESS(errorCode)) {
        /* write the properties data file */
        generateData(destDir, options[CSOURCE].doesOccur);
    }

    exitStore();
    u_cleanup();
    return errorCode;
}
/*
 * Writes "<filename>[-<suffix>].txt" into basename.
 * The "-<suffix>" part is omitted when suffix is NULL.
 * The caller must provide a buffer that is large enough for the result.
 */
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix) {
    /* write position, advanced past each piece as it is appended */
    char *cursor=basename;

    uprv_strcpy(cursor, filename);
    cursor+=uprv_strlen(filename);

    if(suffix!=NULL) {
        *cursor++='-';
        uprv_strcpy(cursor, suffix);
        cursor+=uprv_strlen(suffix);
    }

    uprv_strcpy(cursor, ".txt");
}
/*
 * Tests whether s, after leading white space, starts with token and the
 * token is followed (after optional white space) by ';' or the end of
 * the string.
 * Returns TRUE on such a match, FALSE otherwise.
 */
U_CFUNC UBool
isToken(const char *token, const char *s) {
    const char *rest;

    s=u_skipWhitespace(s);

    /* every character of token must appear in s, in order */
    while(*token!=0) {
        if(*s!=*token) {
            return FALSE;
        }
        ++s;
        ++token;
    }

    /* the token must be delimited, not just a prefix of a longer word */
    rest=u_skipWhitespace(s);
    if(*rest==';' || *rest==0) {
        return TRUE;
    }
    return FALSE;
}
/*
 * Looks up the token at the start of s (after leading white space) in tokens[].
 * NULL entries in tokens[] are skipped. A table entry matches when s starts
 * with it and the next character after optional white space is one of
 * ';', '#', CR, LF or the end of the string.
 * Returns the index of the first matching entry, or -1 if none matches.
 */
U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    int32_t index;

    s=u_skipWhitespace(s);

    for(index=0; index<countTokens; ++index) {
        const char *candidate=tokens[index];
        const char *text=s;
        const char *rest;

        if(candidate==NULL) {
            continue;
        }

        /* consume the common prefix of the candidate and the text */
        while(*candidate!=0 && *text==*candidate) {
            ++candidate;
            ++text;
        }
        if(*candidate!=0) {
            /* the text differs before the end of the candidate */
            continue;
        }

        /* the whole candidate matched; it must be followed by a delimiter */
        rest=u_skipWhitespace(text);
        switch(*rest) {
        case ';':
        case 0:
        case '#':
        case '\r':
        case '\n':
            return index;
        default:
            break;
        }
    }

    return -1;
}
/* parser for UnicodeData.txt ----------------------------------------------- */
/* general categories */
/*
 * Two-letter general category abbreviations as they appear in field 2 of
 * UnicodeData.txt. unicodeDataLineFn() stores the index of the matching
 * entry as the character's general category value, so the order here is
 * significant.
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
"Cn",
"Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
"Mc", "Nd", "Nl", "No",
"Zs", "Zl", "Zp",
"Cc", "Cf", "Co", "Cs",
"Pd", "Ps", "Pe", "Pc", "Po",
"Sm", "Sc", "Sk", "So",
"Pi", "Pf"
};
/*
 * Compatibility decomposition tags as they appear between '<' and '>' in
 * field 5 of UnicodeData.txt. unicodeDataLineFn() uses the index of the
 * matching entry as the decomposition type value.
 * The first two entries are NULL and are therefore never matched by
 * getTokenIndex(); presumably they correspond to "no decomposition" and
 * U_DT_CANONICAL, which have no tag in the file - confirm against uchar.h.
 */
const char *const
decompositionTypeNames[U_DT_COUNT]={
NULL,
NULL,
"compat",
"circle",
"final",
"font",
"fraction",
"initial",
"isolated",
"medial",
"narrow",
"noBreak",
"small",
"square",
"sub",
"super",
"vertical",
"wide"
};
/*
 * Ranges ("areas") that UnicodeData.txt describes with a pair of
 * "<name, First>" / "<name, Last>" lines instead of one line per code point.
 * first/last are the range boundaries, props is the properties word from
 * makeProps() for the First line, and name is the area name without the
 * angle bracket and the ", First>" / ", Last>" part.
 * first==0xffffffff marks the slot at unicodeAreaIndex as not yet started.
 */
static struct {
uint32_t first, last, props;
char name[80];
} unicodeAreas[32];
/* Index of the area currently being collected; equals the number of completed areas. */
static int32_t unicodeAreaIndex=0;
/*
 * Line callback for u_parseDelimitedFile(), called once per UnicodeData.txt line.
 *
 * Parses the code point (field 0), general category (field 2),
 * decomposition type (field 5) and the numeric fields 6..8, then either
 * records the start/end of a Unicode area (names ending in ", First>" or
 * ", Last>") or adds the properties for the single code point.
 *
 * context and fieldCount are not used.
 * fields[i][0] is the start and fields[i][1] the limit of field i.
 * On any error, a message is printed, *pErrorCode is set and the program
 * exits; the function does not return in that case.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    Props p;
    char *end;
    /* last single code point seen, kept across calls for the ordering check */
    static uint32_t prevCode=0;
    uint32_t value;
    int32_t i;

    /* reset the properties */
    uprv_memset(&p, 0, sizeof(Props));

    /* get the character code, field 0 */
    p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get general category, field 2 */
    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    if(i>=0) {
        p.generalCategory=(uint8_t)i;
    } else {
        fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
            fields[2][0], (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get decomposition type, field 5 */
    if(fields[5][0]<fields[5][1]) {
        /* there is some decomposition */
        if(*fields[5][0]!='<') {
            /* canonical */
            i=U_DT_CANONICAL;
        } else {
            /* get compatibility type */
            end=fields[5][0]+1;
            while(end<fields[5][1] && *end!='>') {
                ++end;
            }
            /* replace the '>' with a delimiter that getTokenIndex() accepts */
            *end='#';
            i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
            if(i<0) {
                fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
                    fields[5][0], (unsigned long)p.code);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
        }
        upvec_setValue(pv, p.code, p.code, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
            exit(*pErrorCode);
        }
    }

    /* decimal digit value, field 6 */
    if(fields[6][0]<fields[6][1]) {
        value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
        if(end!=fields[6][1] || value>0x7fff) {
            fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        p.numericValue=(int32_t)value;
        p.numericType=1;
    }

    /* digit value, field 7 */
    if(fields[7][0]<fields[7][1]) {
        value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
        if(end!=fields[7][1] || value>0x7fff) {
            fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        if(p.numericType==0) {
            p.numericValue=(int32_t)value;
            p.numericType=2;
        } else if((int32_t)value!=p.numericValue) {
            fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    /* numeric value, field 8 */
    if(fields[8][0]<fields[8][1]) {
        char *s=fields[8][0];
        UBool isNegative;

        /* get a possible minus sign */
        if(*s=='-') {
            isNegative=TRUE;
            ++s;
        } else {
            isNegative=FALSE;
        }

        value=(uint32_t)uprv_strtoul(s, &end, 10);
        if(value>0 && *end=='/') {
            /* field 8 may contain a fractional value, get the denominator */
            if(p.numericType>0) {
                fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
                    (unsigned long)p.code);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
            p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
            if(p.denominator==0) {
                fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
                    (unsigned long)p.code);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
        }
        if(end!=fields[8][1] || value>0x7fffffff) {
            fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }

        if(p.numericType==0) {
            if(isNegative) {
                p.numericValue=-(int32_t)value;
            } else {
                p.numericValue=(int32_t)value;
            }
            p.numericType=3;
        } else if((int32_t)value!=p.numericValue) {
            fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    value=makeProps(&p);

    if(*fields[1][0]=='<') {
        /* first or last entry of a Unicode area */
        size_t length=fields[1][1]-fields[1][0];

        if(length<9) {
            /* name too short for an area name */
        } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
            /* set the current area */
            if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
                /* drop the leading '<' and the trailing ", First>" */
                length-=9;
                /* the name plus its terminating NUL must fit into name[] */
                if(length>=sizeof(unicodeAreas[unicodeAreaIndex].name)) {
                    fprintf(stderr, "genprops: error - area name too long at code 0x%lx\n",
                        (unsigned long)p.code);
                    *pErrorCode=U_PARSE_ERROR;
                    exit(U_PARSE_ERROR);
                }
                unicodeAreas[unicodeAreaIndex].first=p.code;
                unicodeAreas[unicodeAreaIndex].props=value;
                uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
                unicodeAreas[unicodeAreaIndex].name[length]=0;
            } else {
                /* error: a previous area is incomplete */
                fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
            return;
        } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
            /* check that the current area matches, and complete it with the last code point */
            /* drop the leading '<' and the trailing ", Last>" */
            length-=8;
            if( /* a longer name cannot match and must not be compared beyond name[] */
                length<sizeof(unicodeAreas[unicodeAreaIndex].name) &&
                unicodeAreas[unicodeAreaIndex].props==value &&
                0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
                unicodeAreas[unicodeAreaIndex].name[length]==0 &&
                unicodeAreas[unicodeAreaIndex].first<p.code
            ) {
                unicodeAreas[unicodeAreaIndex].last=p.code;
                if(beVerbose) {
                    printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
                        (unsigned long)unicodeAreas[unicodeAreaIndex].first,
                        (unsigned long)unicodeAreas[unicodeAreaIndex].last,
                        unicodeAreas[unicodeAreaIndex].name);
                }
                /* the next slot is needed as the "unused" marker; make sure it exists */
                if((size_t)unicodeAreaIndex+1>=sizeof(unicodeAreas)/sizeof(unicodeAreas[0])) {
                    fprintf(stderr, "genprops: error - too many Unicode areas (more than %lu)\n",
                        (unsigned long)(sizeof(unicodeAreas)/sizeof(unicodeAreas[0])-1));
                    *pErrorCode=U_PARSE_ERROR;
                    exit(U_PARSE_ERROR);
                }
                unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
            } else {
                /* error: different properties between first & last, different area name, first>=last */
                fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
            return;
        } else {
            /* not an area name */
        }
    }

    /* check for non-character code points */
    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
        fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check that the code points (p.code) are in ascending order */
    if(p.code<=prevCode && p.code>0) {
        fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
            (unsigned long)p.code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=p.code;

    /* properties for a single code point */
    addProps(p.code, value);
}
/* set repeated properties for the areas */
/*
 * Applies the properties of every completed Unicode area to all code points
 * of that area, adds the plane 15/16 private use areas if the data file did
 * not list them, and marks the Hangul syllables U+AC00..U+D7A3 as having
 * canonical decompositions.
 * Prints a message and exits if the decomposition type cannot be set.
 */
static void
repeatAreaProps() {
    UErrorCode errorCode=U_ZERO_ERROR;
    uint32_t bmpPuaProps=0;
    UBool sawPlane15PUA=FALSE, sawPlane16PUA=FALSE;
    int32_t areaIndex;

    for(areaIndex=0; areaIndex<unicodeAreaIndex; ++areaIndex) {
        const uint32_t first=unicodeAreas[areaIndex].first;
        const uint32_t props=unicodeAreas[areaIndex].props;

        repeatProps(first, unicodeAreas[areaIndex].last, props);

        /* remember which private use areas the data file contains */
        switch(first) {
        case 0xe000:
            bmpPuaProps=props;
            break;
        case 0xf0000:
            sawPlane15PUA=TRUE;
            break;
        case 0x100000:
            sawPlane16PUA=TRUE;
            break;
        default:
            break;
        }
    }

    /*
     * UnicodeData.txt before 3.0.1 did not contain the PUAs on
     * planes 15 and 16.
     * If that is the case, then we add them here, using the properties
     * from the BMP PUA.
     */
    if(bmpPuaProps!=0 && !sawPlane15PUA) {
        repeatProps(0xf0000, 0xffffd, bmpPuaProps);
    }
    if(bmpPuaProps!=0 && !sawPlane16PUA) {
        repeatProps(0x100000, 0x10fffd, bmpPuaProps);
    }

    /* Hangul have canonical decompositions */
    upvec_setValue(pv, 0xac00, 0xd7a3, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
        exit(errorCode);
    }
}
/*
 * Parses the UnicodeData file and stores the properties of all characters.
 *
 * filename   - path of the UnicodeData file to read
 * pErrorCode - in/out error code; nothing is done if it is NULL or already
 *              indicates a failure
 *
 * If the file cannot be parsed, a message is printed and the function
 * returns with *pErrorCode set, without doing any post-processing.
 * If the file ends inside an unfinished "First"/"Last" area, a message is
 * printed and the program exits.
 */
static void
parseDB(const char *filename, UErrorCode *pErrorCode) {
    char *fields[15][2];

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
    unicodeAreas[0].first=0xffffffff;

    u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        /* report the failure here; do not post-process incomplete data */
        fprintf(stderr, "genprops: error - unable to parse \"%s\" (error code %d)\n",
            filename, (int)*pErrorCode);
        return;
    }

    if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
        fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
            unicodeAreas[unicodeAreaIndex].name,
            (unsigned long)unicodeAreas[unicodeAreaIndex].first);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    repeatAreaProps();
}
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/