|  | /* | 
|  | ******************************************************************************* | 
|  | * | 
|  | *   Copyright (C) 1999-2001, International Business Machines | 
|  | *   Corporation and others.  All Rights Reserved. | 
|  | * | 
|  | ******************************************************************************* | 
|  | *   file name:  gennames.c | 
|  | *   encoding:   US-ASCII | 
|  | *   tab size:   8 (not used) | 
|  | *   indentation:4 | 
|  | * | 
|  | *   created on: 1999sep30 | 
|  | *   created by: Markus W. Scherer | 
|  | * | 
|  | *   This program reads the Unicode character database text file, | 
|  | *   parses it, and extracts the character code, | 
|  | *   the "modern" character name, and optionally the | 
|  | *   Unicode 1.0 character name. | 
|  | *   It then tokenizes and compresses the names and builds | 
|  | *   compact binary tables for random-access lookup | 
|  | *   in a u_charName() API function. | 
|  | * | 
|  | * unames.dat file format (after UDataInfo header etc. - see udata.c) | 
|  | * (all data is static const) | 
|  | * | 
|  | * UDataInfo fields: | 
|  | *   dataFormat "unam" | 
|  | *   formatVersion 1.0 | 
|  | *   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.1.1 | 
|  | * | 
|  | * -- data-based names | 
|  | * uint32_t tokenStringOffset, | 
|  | *          groupsOffset, | 
|  | *          groupStringOffset, | 
|  | *          algNamesOffset; | 
|  | * | 
|  | * uint16_t tokenCount; | 
|  | * uint16_t tokenTable[tokenCount]; | 
|  | * | 
|  | * char     tokenStrings[]; -- padded to even count | 
|  | * | 
|  | * -- strings (groupStrings) are tokenized as follows: | 
|  | *   for each character c | 
|  | *       if(c>=tokenCount) write that character c directly | 
|  | *   else | 
|  | *       token=tokenTable[c]; | 
|  | *       if(token==0xfffe) -- lead byte of double-byte token | 
|  | *           token=tokenTable[c<<8|next character]; | 
|  | *       if(token==-1) | 
|  | *           write c directly | 
|  | *       else | 
|  | *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;) | 
|  | *           append zero-terminated tokenString; | 
|  | * | 
|  | * uint16_t groupCount; | 
|  | * struct { | 
|  | *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5 | 
|  | *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset | 
|  | *   uint16_t offsetLow; | 
|  | * } groupTable[groupCount]; | 
|  | * | 
|  | * char     groupStrings[]; -- padded to 4-count | 
|  | * | 
|  | * -- The actual, tokenized group strings are not zero-terminated because | 
|  | *   that would take up too much space. | 
|  | *   Instead, they are preceded by their length, written in a variable-length sequence: | 
|  | *   For each of the 32 group strings, one or two nibbles are stored for its length. | 
|  | *   Nibbles (4-bit values, half-bytes) are read MSB first. | 
|  | *   A nibble with a value of 0..11 directly indicates the length of the name string. | 
|  | *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m | 
|  | *   by (((n-12)<<4)|m)+12, reaching values of 12..75. | 
|  | *   These lengths are sequentially for each tokenized string, not for the de-tokenized result. | 
|  | *   For the de-tokenizing, see token description above; the strings immediately follow the | 
|  | *   32 lengths. | 
|  | * | 
|  | * -- algorithmic names | 
|  | * | 
|  | * typedef struct AlgorithmicRange { | 
|  | *     uint32_t rangeStart, rangeEnd; | 
|  | *     uint8_t algorithmType, algorithmVariant; | 
|  | *     uint16_t rangeSize; | 
|  | * } AlgorithmicRange; | 
|  | * | 
|  | * uint32_t algRangesCount; -- number of data blocks for ranges of | 
|  | *               algorithmic names (before Unicode 3.0: 2, Unicode 3.0: 3, | 
|  | *               Unicode 3.1 and up: 4; hardcoded in gennames) | 
|  | * | 
|  | * struct { | 
|  | *     AlgorithmicRange algRange; | 
|  | *     uint8_t algRangeData[]; -- padded to 4-count except in last range | 
|  | * } algRanges[algRangesCount]; | 
|  | * -- not a real array because each part has a different size | 
|  | *    of algRange.rangeSize (including AlgorithmicRange) | 
|  | * | 
|  | * -- algorithmic range types: | 
|  | * | 
|  | * 0 Names are formed from a string prefix that is stored in | 
|  | *   the algRangeData (zero-terminated), followed by the Unicode code point | 
|  | *   of the character in hexadecimal digits; | 
|  | *   algRange.algorithmVariant digits are written | 
|  | * | 
|  | * 1 Names are formed by calculating modulo-factors of the code point value as follows: | 
|  | *   algRange.algorithmVariant is the count of modulo factors | 
|  | *   algRangeData contains | 
|  | *       uint16_t factors[algRange.algorithmVariant]; | 
|  | *       char strings[]; | 
|  | *   the first zero-terminated string is written as the prefix; then: | 
|  | * | 
|  | *   The rangeStart is subtracted; with the difference, here "code": | 
|  | *   for(i=algRange.algorithmVariant-1 to 0 step -1) | 
|  | *       index[i]=code%factor[i]; | 
|  | *       code/=factor[i]; | 
|  | * | 
|  | *   The strings after the prefix are short pieces that are then appended to the result | 
|  | *   according to index[0..algRange.algorithmVariant-1]. | 
|  | */ | 
|  |  | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include "unicode/utypes.h" | 
|  | #include "unicode/putil.h" | 
|  | #include "cmemory.h" | 
|  | #include "cstring.h" | 
|  | #include "unicode/udata.h" | 
|  | #include "unewdata.h" | 
|  | #include "uoptions.h" | 
|  | #include "uparse.h" | 
|  |  | 
/* size in bytes of stringStore and of groupStore, respectively */
#define STRING_STORE_SIZE 1000000
#define GROUP_STORE_SIZE 5000

/* names are stored in groups of 1<<GROUP_SHIFT (32) consecutive code points */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
/* mask for the index of a code point within its group */
#define GROUP_MASK (LINES_PER_GROUP-1)

/* capacities of the lines[] and words[] tables */
#define MAX_LINE_COUNT 50000
#define MAX_WORD_COUNT 20000
#define MAX_GROUP_COUNT 5000

/* the output file is DATA_NAME "." DATA_TYPE */
#define DATA_NAME "unames"
#define DATA_TYPE "dat"
#define VERSION_STRING "unam"
#define NAME_SEPARATOR_CHAR ';'
|  |  | 
/* version thresholds that select the set of algorithmic name ranges */
static const UVersionInfo
unicode_3_0={ 3, 0, 0, 0 },
unicode_3_1={ 3, 1, 0, 0 };

/* UDataInfo cf. udata.h */
/* dataVersion below is only a placeholder; main() overwrites it with the -u option value */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
    {1, 0, 0, 0},                 /* formatVersion */
    {3, 0, 0, 0}                  /* dataVersion */
};
|  |  | 
/* command line flags; all three are set from the parsed options in main() */
static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;

/*
 * stringStore holds the name strings while parsing; compressLines() then
 * reuses it from offset 0 for the compressed group strings, and
 * generateData() writes the token strings from the bytes that follow them.
 * groupStore is the scratch buffer for the group currently being compressed.
 * lineLengths is presumably filled by appendLineLength() - defined elsewhere.
 */
static uint8_t stringStore[STRING_STORE_SIZE],
               groupStore[GROUP_STORE_SIZE],
               lineLengths[LINES_PER_GROUP];

/*
 * lineTop: number of bytes used at the start of stringStore
 * wordBottom: starts at the end of stringStore; presumably words are
 *             allocated downwards from there - see allocWord()
 */
static uint32_t lineTop=0, wordBottom=STRING_STORE_SIZE, lineLengthsTop;

/*
 * One stored name line. After compressLines() the same table is used for
 * groups: generateData() writes code as the groupMSB and s as the offset of
 * the group's strings.
 */
typedef struct {
    uint32_t code;
    int16_t length;
    uint8_t *s;
} Line;

/* One dictionary word that is a candidate for becoming a token. */
typedef struct {
    int32_t weight; /* -(cost for token) + (number of occurrences) * (length-1) */
    int16_t count;
    int16_t length;
    uint8_t *s;
} Word;

static Line lines[MAX_LINE_COUNT];
static Word words[MAX_WORD_COUNT];

static uint32_t lineCount=0, wordCount=0;

/* number of lead bytes used for double-byte tokens, set by compress() */
static int16_t leadByteCount;

#define LEADBYTE_LIMIT 16

/*
 * Token table, indexed by the one- or two-byte token value.
 * -1 marks a byte value that is written directly as a character,
 * -2 marks a lead byte of double-byte tokens.
 * Other entries hold a word index after compress(), which generateData()
 * replaces with the offset of the token string.
 */
static int16_t tokens[LEADBYTE_LIMIT*256];
static uint32_t tokenCount;
|  |  | 
|  | /* prototypes --------------------------------------------------------------- */ | 
|  |  | 
/* setup and parsing of the Unicode character database */

static void
init(void);

static void
parseDB(const char *filename, UBool store10Names);

static void
parseName(char *name, int16_t length);

static int16_t
skipNoise(char *line, int16_t start, int16_t limit);

static int16_t
getWord(char *line, int16_t start, int16_t limit);

/* tokenizing and compressing the names */

static void
compress(void);

static void
compressLines(void);

static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);

static int
compareWords(const void *word1, const void *word2);

/* writing the binary output */

static void
generateData(const char *dataDir);

static uint32_t
generateAlgorithmicData(UNewDataMemory *pData);

/* lookup and storage helpers */

static int16_t
findToken(uint8_t *s, int16_t length);

static Word *
findWord(char *s, int16_t length);

static Word *
addWord(char *s, int16_t length);

static void
countWord(Word *word);

static void
addLine(uint32_t code, char *name1, int16_t name1Length, char *name2, int16_t name2Length);

static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);

static uint32_t
addToken(uint8_t *s, int16_t length);

static void
appendLineLength(int16_t length);

static void
appendLineLengthNibble(uint8_t nibble);

static uint8_t *
allocLine(uint32_t length);

static uint8_t *
allocWord(uint32_t length);
|  |  | 
|  | /* -------------------------------------------------------------------------- */ | 
|  |  | 
/*
 * Command line options. main() accesses the entries by index,
 * so the order here must not change without updating main().
 */
static UOption options[]={
    UOPTION_HELP_H,                 /* 0 */
    UOPTION_HELP_QUESTION_MARK,     /* 1 */
    UOPTION_VERBOSE,                /* 2 */
    UOPTION_QUIET,                  /* 3 */
    UOPTION_COPYRIGHT,              /* 4 */
    UOPTION_DESTDIR,                /* 5 */
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },     /* 6 */
    { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }     /* 7 */
};
|  |  | 
|  | extern int | 
|  | main(int argc, char* argv[]) { | 
|  | UVersionInfo version; | 
|  | UBool store10Names=FALSE; | 
|  |  | 
|  | /* preset then read command line options */ | 
|  | options[5].value=u_getDataDirectory(); | 
|  | options[6].value="3.1.1"; | 
|  | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); | 
|  |  | 
|  | /* error handling, printing usage message */ | 
|  | if(argc<0) { | 
|  | fprintf(stderr, | 
|  | "error in command line argument \"%s\"\n", | 
|  | argv[-argc]); | 
|  | } else if(argc<2) { | 
|  | argc=-1; | 
|  | } | 
|  | if(argc<0 || options[0].doesOccur || options[1].doesOccur) { | 
|  | /* | 
|  | * Broken into chucks because the C89 standard says the minimum | 
|  | * required supported string length is 509 bytes. | 
|  | */ | 
|  | fprintf(stderr, | 
|  | "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n" | 
|  | "\n" | 
|  | "Read the UnicodeData.txt file and \n" | 
|  | "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n" | 
|  | "\n" | 
|  | "\tfilename  absolute path/filename for the Unicode database text file\n" | 
|  | "\t\t(default: standard input)\n" | 
|  | "\n", | 
|  | argv[0]); | 
|  | fprintf(stderr, | 
|  | "Options:\n" | 
|  | "\t-h or -? or --help  this usage text\n" | 
|  | "\t-v or --verbose     verbose output\n" | 
|  | "\t-q or --quiet       no output\n" | 
|  | "\t-c or --copyright   include a copyright notice\n" | 
|  | "\t-d or --destdir     destination directory, followed by the path\n" | 
|  | "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n" | 
|  | "\t-1 or --unicode1-names  store Unicode 1.0 character names\n"); | 
|  | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | 
|  | } | 
|  |  | 
|  | /* get the options values */ | 
|  | beVerbose=options[2].doesOccur; | 
|  | beQuiet=options[3].doesOccur; | 
|  | haveCopyright=options[4].doesOccur; | 
|  | store10Names=options[7].doesOccur; | 
|  |  | 
|  | /* set the Unicode version */ | 
|  | u_versionFromString(version, options[6].value); | 
|  | uprv_memcpy(dataInfo.dataVersion, version, 4); | 
|  |  | 
|  | init(); | 
|  | parseDB(argc>=2 ? argv[1] : "-", store10Names); | 
|  | compress(); | 
|  | generateData(options[5].value); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static void | 
|  | init() { | 
|  | int i; | 
|  |  | 
|  | for(i=0; i<256; ++i) { | 
|  | tokens[i]=0; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* parsing ------------------------------------------------------------------ */ | 
|  |  | 
|  | static void | 
|  | lineFn(void *context, | 
|  | char *fields[][2], int32_t fieldCount, | 
|  | UErrorCode *pErrorCode) { | 
|  | static uint32_t prevCode=0; | 
|  | uint32_t code=0; | 
|  | char *name1Start, *name2Start; | 
|  | int16_t name1Length, name2Length; | 
|  |  | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return; | 
|  | } | 
|  | /* get the character code */ | 
|  | code=uprv_strtoul(fields[0][0], NULL, 16); | 
|  |  | 
|  | /* get the character name */ | 
|  | name1Start=fields[1][0]; | 
|  | if(fields[1][0][0]!='<') { | 
|  | name1Length=(int16_t)(fields[1][1]-name1Start); | 
|  | } else { | 
|  | /* do not store pseudo-names in <> brackets */ | 
|  | name1Length=0; | 
|  | } | 
|  |  | 
|  | /* store 1.0 names */ | 
|  | /* get the second character name, the one from Unicode 1.0 */ | 
|  | /* do not store pseudo-names in <> brackets */ | 
|  | name2Start=fields[10][0]; | 
|  | if(*(UBool *)context && fields[10][0][0]!='<') { | 
|  | name2Length=(int16_t)(fields[10][1]-name2Start); | 
|  | } else { | 
|  | name2Length=0; | 
|  | } | 
|  |  | 
|  | if(name1Length+name2Length==0) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* check for non-character code points */ | 
|  | if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20) { | 
|  | fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n", | 
|  | (unsigned long)code); | 
|  | *pErrorCode=U_PARSE_ERROR; | 
|  | exit(U_PARSE_ERROR); | 
|  | } | 
|  |  | 
|  | /* check that the code points (code) are in ascending order */ | 
|  | if(code<=prevCode && code>0) { | 
|  | fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", | 
|  | (unsigned long)code, (unsigned long)prevCode); | 
|  | *pErrorCode=U_PARSE_ERROR; | 
|  | exit(U_PARSE_ERROR); | 
|  | } | 
|  | prevCode=code; | 
|  |  | 
|  | /* printf("%lx:%.*s(%.*s)\n", code, name1Length, line+name1Start, name2Length, line+name2Start); */ | 
|  |  | 
|  | parseName(name1Start, name1Length); | 
|  | parseName(name2Start, name2Length); | 
|  |  | 
|  | addLine(code, name1Start, name1Length, name2Start, name2Length); | 
|  | } | 
|  |  | 
|  | static void | 
|  | parseDB(const char *filename, UBool store10Names) { | 
|  | char *fields[11][2]; | 
|  | UErrorCode errorCode=U_ZERO_ERROR; | 
|  |  | 
|  | /* parsing the 11 fields 0..10 is enough for gennames */ | 
|  | u_parseDelimitedFile(filename, ';', fields, 11, lineFn, &store10Names, &errorCode); | 
|  | if(U_FAILURE(errorCode)) { | 
|  | exit(errorCode); | 
|  | } | 
|  |  | 
|  | if(!beQuiet) { | 
|  | printf("size of all names in the database: %lu\n", | 
|  | (unsigned long)lineTop); | 
|  | printf("number of named Unicode characters: %lu\n", | 
|  | (unsigned long)lineCount); | 
|  | printf("number of words in the dictionary from these names: %lu\n", | 
|  | (unsigned long)wordCount); | 
|  | } | 
|  | } | 
|  |  | 
|  | static void | 
|  | parseName(char *name, int16_t length) { | 
|  | int16_t start=0, limit, wordLength/*, prevStart=-1*/; | 
|  | Word *word; | 
|  |  | 
|  | while(start<length) { | 
|  | /* skip any "noise" characters */ | 
|  | limit=skipNoise(name, start, length); | 
|  | if(start<limit) { | 
|  | /*prevStart=-1;*/ | 
|  | start=limit; | 
|  | } | 
|  | if(start==length) { | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* get a word and add it if it is longer than 1 */ | 
|  | limit=getWord(name, start, length); | 
|  | wordLength=(int16_t)(limit-start); | 
|  | if(wordLength>1) { | 
|  | word=findWord(name+start, wordLength); | 
|  | if(word==NULL) { | 
|  | word=addWord(name+start, wordLength); | 
|  | } | 
|  | countWord(word); | 
|  | } | 
|  |  | 
|  | #if 0 | 
|  | /* | 
|  | * if there was a word before this | 
|  | * (with no noise in between), then add the pair of words, too | 
|  | */ | 
|  | if(prevStart!=-1) { | 
|  | wordLength=limit-prevStart; | 
|  | word=findWord(name+prevStart, wordLength); | 
|  | if(word==NULL) { | 
|  | word=addWord(name+prevStart, wordLength); | 
|  | } | 
|  | countWord(word); | 
|  | } | 
|  | #endif | 
|  |  | 
|  | /*prevStart=start;*/ | 
|  | start=limit; | 
|  | } | 
|  | } | 
|  |  | 
/*
 * Skip "noise": every character that is not an uppercase ASCII letter or
 * an ASCII digit.
 *
 * line  the text
 * start index of the first character to look at
 * limit index after the last character to look at
 *
 * Returns the index of the first word character at or after start,
 * or limit if there is none.
 */
static int16_t
skipNoise(char *line, int16_t start, int16_t limit) {
    for(; start<limit; ++start) {
        char c=line[start];

        if(('A'<=c && c<='Z') || ('0'<=c && c<='9')) {
            break;
        }
    }

    return start;
}
|  |  | 
/*
 * Find the end of the word that starts at line[start].
 * A word is a run of A-Z and 0-9; a single space or dash that directly
 * follows is included.
 *
 * line  the text
 * start index of the first character of the word
 * limit index after the last character to look at
 *
 * Returns the index after the word (and after its trailing space or dash).
 */
static int16_t
getWord(char *line, int16_t start, int16_t limit) {
    char c=0; /* stays 0 if no character is read at all */

    while(start<limit) {
        c=line[start];
        if(!(('A'<=c && c<='Z') || ('0'<=c && c<='9'))) {
            /* c is the first character that is not part of the word */
            break;
        }
        ++start;
    }

    /* take one following space or dash along with the word */
    if(start<limit && (c==' ' || c=='-')) {
        ++start;
    }

    return start;
}
|  |  | 
|  | /* compressing -------------------------------------------------------------- */ | 
|  |  | 
|  | static void | 
|  | compress() { | 
|  | uint32_t i, letterCount; | 
|  | int16_t wordNumber; | 
|  |  | 
|  | /* sort the words in reverse order by weight */ | 
|  | qsort(words, wordCount, sizeof(Word), compareWords); | 
|  |  | 
|  | /* remove the words that do not save anything */ | 
|  | while(wordCount>0 && words[wordCount-1].weight<1) { | 
|  | --wordCount; | 
|  | } | 
|  |  | 
|  | /* count the letters in the token range */ | 
|  | letterCount=0; | 
|  | for(i=LEADBYTE_LIMIT; i<256; ++i) { | 
|  | if(tokens[i]==-1) { | 
|  | ++letterCount; | 
|  | } | 
|  | } | 
|  | if(!beQuiet) { | 
|  | printf("number of letters used in the names: %d\n", letterCount); | 
|  | } | 
|  |  | 
|  | /* do we need double-byte tokens? */ | 
|  | if(wordCount+letterCount<=256) { | 
|  | /* no, single-byte tokens are enough */ | 
|  | leadByteCount=0; | 
|  | for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) { | 
|  | if(tokens[i]!=-1) { | 
|  | tokens[i]=wordNumber; | 
|  | if(beVerbose) { | 
|  | printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", | 
|  | i, (long)words[wordNumber].weight, | 
|  | words[wordNumber].length, words[wordNumber].s); | 
|  | } | 
|  | ++wordNumber; | 
|  | } | 
|  | } | 
|  | tokenCount=i; | 
|  | } else { | 
|  | /* | 
|  | * The tokens that need two token bytes | 
|  | * get their weight reduced by their count | 
|  | * because they save less. | 
|  | */ | 
|  | tokenCount=256-letterCount; | 
|  | for(i=tokenCount; i<wordCount; ++i) { | 
|  | words[i].weight-=words[i].count; | 
|  | } | 
|  |  | 
|  | /* sort these words in reverse order by weight */ | 
|  | qsort(words+tokenCount, wordCount-tokenCount, sizeof(Word), compareWords); | 
|  |  | 
|  | /* remove the words that do not save anything */ | 
|  | while(wordCount>0 && words[wordCount-1].weight<1) { | 
|  | --wordCount; | 
|  | } | 
|  |  | 
|  | /* how many tokens and lead bytes do we have now? */ | 
|  | tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1); | 
|  | leadByteCount=(int16_t)(tokenCount>>8); | 
|  | if(leadByteCount<LEADBYTE_LIMIT) { | 
|  | /* adjust for the real number of lead bytes */ | 
|  | tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount; | 
|  | } else { | 
|  | /* limit the number of lead bytes */ | 
|  | leadByteCount=LEADBYTE_LIMIT-1; | 
|  | tokenCount=LEADBYTE_LIMIT*256; | 
|  | wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1); | 
|  | } | 
|  |  | 
|  | /* set token 0 to word 0 */ | 
|  | tokens[0]=0; | 
|  | if(beVerbose) { | 
|  | printf("tokens[0x000]: word%8ld \"%.*s\"\n", | 
|  | (long)words[0].weight, | 
|  | words[0].length, words[0].s); | 
|  | } | 
|  | wordNumber=1; | 
|  |  | 
|  | /* set the lead byte tokens */ | 
|  | for(i=1; (int16_t)i<=leadByteCount; ++i) { | 
|  | tokens[i]=-2; | 
|  | } | 
|  |  | 
|  | /* set the tokens */ | 
|  | for(; i<256; ++i) { | 
|  | if(tokens[i]!=-1) { | 
|  | tokens[i]=wordNumber; | 
|  | if(beVerbose) { | 
|  | printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", | 
|  | i, (long)words[wordNumber].weight, | 
|  | words[wordNumber].length, words[wordNumber].s); | 
|  | } | 
|  | ++wordNumber; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* continue above 255 where there are no letters */ | 
|  | for(; i<tokenCount; ++i) { | 
|  | tokens[i]=wordNumber; | 
|  | if(beVerbose) { | 
|  | printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", | 
|  | i, (long)words[wordNumber].weight, | 
|  | words[wordNumber].length, words[wordNumber].s); | 
|  | } | 
|  | ++wordNumber; | 
|  | } | 
|  | } | 
|  |  | 
|  | if(!beQuiet) { | 
|  | printf("number of lead bytes: %d\n", leadByteCount); | 
|  | printf("number of single-byte tokens: %lu\n", | 
|  | (unsigned long)256-letterCount-leadByteCount); | 
|  | printf("number of tokens: %lu\n", (unsigned long)tokenCount); | 
|  | } | 
|  |  | 
|  | compressLines(); | 
|  | } | 
|  |  | 
/*
 * Compress all stored lines and regroup them into groups of LINES_PER_GROUP
 * consecutive code points.
 *
 * The lines[] table and the start of stringStore are reused for the output:
 * lineTop and lineCount are reset to 0 and the original line count is kept in
 * lineCount2. When the function is done, lineCount is printed as the number
 * of groups. Each group is built in groupStore and then handed to addGroup().
 *
 * Exits the program if a group does not fit into groupStore, or if the
 * group data written so far would run into the line that is being read.
 */
static void
compressLines() {
    Line *line=NULL;
    /*
     * outLine is the code point of the last line written to the current group.
     * It starts at 0xffffffff so that the first ++outLine wraps around to 0
     * and the first "finish the current group" loop below does nothing.
     * groupMSB==0xffff means that no group has been started yet.
     */
    uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
             groupMSB=0xffff, lineCount2;
    int16_t groupTop=0;

    /* store the groups like lines, reusing the lines' memory */
    lineTop=0;
    lineCount2=lineCount;
    lineCount=0;

    /* loop over all lines */
    while(i<lineCount2) {
        line=lines+i++;
        inLine=line->code;

        /* segment the lines to groups of 32 */
        if(inLine>>GROUP_SHIFT!=groupMSB) {
            /* finish the current group with empty lines */
            while((++outLine&GROUP_MASK)!=0) {
                appendLineLength(0);
            }

            /* store the group like a line */
            if(groupTop>0) {
                /* note: this detects an overflow only after groupStore was written */
                if(groupTop>GROUP_STORE_SIZE) {
                    fprintf(stderr, "gennames: group store overflow\n");
                    exit(U_BUFFER_OVERFLOW_ERROR);
                }
                addGroup(groupMSB, groupStore, groupTop);
                /* the output must not overtake the line that has not been compressed yet */
                if(lineTop>(uint32_t)(line->s-stringStore)) {
                    fprintf(stderr, "gennames: group store runs into string store\n");
                    exit(U_INTERNAL_PROGRAM_ERROR);
                }
            }

            /* start the new group */
            lineLengthsTop=0;
            groupTop=0;
            groupMSB=inLine>>GROUP_SHIFT;
            /* one before the first code point of the group; wraps around for group 0 */
            outLine=(inLine&~GROUP_MASK)-1;
        }

        /* write empty lines between the previous line in the group and this one */
        while(++outLine<inLine) {
            appendLineLength(0);
        }

        /* write characters and tokens for this line */
        appendLineLength(compressLine(line->s, line->length, &groupTop));
    }

    /* finish and store the last group */
    if(line && groupMSB!=0xffff) {
        /* finish the current group with empty lines */
        while((++outLine&GROUP_MASK)!=0) {
            appendLineLength(0);
        }

        /* store the group like a line */
        if(groupTop>0) {
            if(groupTop>GROUP_STORE_SIZE) {
                fprintf(stderr, "gennames: group store overflow\n");
                exit(U_BUFFER_OVERFLOW_ERROR);
            }
            addGroup(groupMSB, groupStore, groupTop);
            if(lineTop>(uint32_t)(line->s-stringStore)) {
                fprintf(stderr, "gennames: group store runs into string store\n");
                exit(U_INTERNAL_PROGRAM_ERROR);
            }
        }
    }

    if(!beQuiet) {
        printf("number of groups: %lu\n", (unsigned long)lineCount);
    }
}
|  |  | 
|  | static int16_t | 
|  | compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) { | 
|  | int16_t start, limit, token, groupTop=*pGroupTop; | 
|  |  | 
|  | start=0; | 
|  | do { | 
|  | /* write any "noise" characters */ | 
|  | limit=skipNoise((char *)s, start, length); | 
|  | while(start<limit) { | 
|  | groupStore[groupTop++]=s[start++]; | 
|  | } | 
|  |  | 
|  | if(start==length) { | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* write a word, as token or directly */ | 
|  | limit=getWord((char *)s, start, length); | 
|  | if(limit-start==1) { | 
|  | groupStore[groupTop++]=s[start++]; | 
|  | } else { | 
|  | token=findToken(s+start, (int16_t)(limit-start)); | 
|  | if(token!=-1) { | 
|  | if(token>0xff) { | 
|  | groupStore[groupTop++]=(uint8_t)(token>>8); | 
|  | } | 
|  | groupStore[groupTop++]=(uint8_t)token; | 
|  | start=limit; | 
|  | } else { | 
|  | while(start<limit) { | 
|  | groupStore[groupTop++]=s[start++]; | 
|  | } | 
|  | } | 
|  | } | 
|  | } while(start<length); | 
|  |  | 
|  | length=(int16_t)(groupTop-*pGroupTop); | 
|  | *pGroupTop=groupTop; | 
|  | return length; | 
|  | } | 
|  |  | 
|  | static int | 
|  | compareWords(const void *word1, const void *word2) { | 
|  | /* reverse sort by word weight */ | 
|  | return ((Word *)word2)->weight-((Word *)word1)->weight; | 
|  | } | 
|  |  | 
|  | /* generate output data ----------------------------------------------------- */ | 
|  |  | 
/*
 * Write the binary output file DATA_NAME "." DATA_TYPE into dataDir.
 *
 * On entry, lineTop is the length of the group strings at the start of
 * stringStore; this value is kept in groupTop. The token strings are then
 * added with addToken() - presumably appended after the group strings, since
 * they are written from stringStore+groupTop with length lineTop-groupTop.
 *
 * Exits the program if the data memory cannot be created, if writing fails,
 * or if the number of bytes written differs from the calculated size.
 */
static void
generateData(const char *dataDir) {
    UNewDataMemory *pData;
    UErrorCode errorCode=U_ZERO_ERROR;
    uint16_t groupWords[3];
    uint32_t i, groupTop=lineTop, offset, size,
             tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
    long dataLength;
    int16_t token;

    pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
                       haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
        exit(errorCode);
    }

    /* first, see how much space we need, and prepare the token strings */
    for(i=0; i<tokenCount; ++i) {
        token=tokens[i];
        if(token!=-1 && token!=-2) {
            /*
             * replace the word index with the offset of the token string,
             * relative to the start of the token strings;
             * the offset is stored in 16 bits
             */
            tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
        }
    }

    /*
     * Calculate the total size in bytes of the data including:
     * - the offset to the token strings, uint32_t (4)
     * - the offset to the group table, uint32_t (4)
     * - the offset to the group strings, uint32_t (4)
     * - the offset to the algorithmic names, uint32_t (4)
     *
     * - the number of tokens, uint16_t (2)
     * - the token table, uint16_t[tokenCount] (2*tokenCount)
     *
     * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
     *
     * - the number of groups, uint16_t (2)
     * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[groupCount]
     *   (6*groupCount; the group count is in lineCount here)
     *
     * - the group strings (groupTop), padded so that what follows is 4-aligned
     *
     * - the size of the data for the algorithmic names
     */
    tokenStringOffset=4+4+4+4+2+2*tokenCount;
    groupsOffset=(tokenStringOffset+(lineTop-groupTop+1))&~1;
    groupStringOffset=groupsOffset+2+6*lineCount;
    algNamesOffset=(groupStringOffset+groupTop+3)&~3;

    /* with NULL, this only calculates the size of the algorithmic names data */
    offset=generateAlgorithmicData(NULL);
    size=algNamesOffset+offset;

    if(!beQuiet) {
        printf("size of the Unicode Names data:\n"
               "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
               (unsigned long)size, (unsigned long)(lineTop-groupTop),
               (unsigned long)groupTop, (unsigned long)offset);
    }

    /* write the data to the file */
    /* offsets */
    udata_write32(pData, tokenStringOffset);
    udata_write32(pData, groupsOffset);
    udata_write32(pData, groupStringOffset);
    udata_write32(pData, algNamesOffset);

    /* token table */
    udata_write16(pData, (uint16_t)tokenCount);
    udata_writeBlock(pData, tokens, 2*tokenCount);

    /* token strings */
    udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
    if((lineTop-groupTop)&1) {
        /* 2-padding */
        udata_writePadding(pData, 1);
    }

    /* group table */
    udata_write16(pData, (uint16_t)lineCount);
    for(i=0; i<lineCount; ++i) {
        /* groupMSB */
        groupWords[0]=(uint16_t)lines[i].code;

        /* offset of the group strings from the start of stringStore, as two 16-bit halves */
        offset=lines[i].s-stringStore;
        groupWords[1]=(uint16_t)(offset>>16);
        groupWords[2]=(uint16_t)(offset);
        udata_writeBlock(pData, groupWords, 6);
    }

    /* group strings */
    udata_writeBlock(pData, stringStore, groupTop);

    /* 4-align the algorithmic names data */
    udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));

    /* with a non-NULL pData, this writes the algorithmic names data */
    generateAlgorithmicData(pData);

    /* finish up */
    dataLength=udata_finish(pData, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
        exit(errorCode);
    }

    /* consistency check of the size calculation above */
    if(dataLength!=(long)size) {
        fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
                dataLength, (unsigned long)size);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
}
|  |  | 
|  | /* the structure for algorithmic names needs to be 4-aligned */ | 
/*
 * Header of one range of algorithmic names as it is written to the data file.
 * rangeStart and rangeEnd are the first and last code points of the range.
 * algorithmType and algorithmVariant select the algorithm, see the file
 * format description at the top of this file.
 * rangeSize is the size in bytes of this header plus the range data that
 * follows it.
 */
typedef struct AlgorithmicRange {
    uint32_t rangeStart, rangeEnd;
    uint8_t algorithmType, algorithmVariant;
    uint16_t rangeSize;
} AlgorithmicRange;
|  |  | 
|  | static uint32_t | 
|  | generateAlgorithmicData(UNewDataMemory *pData) { | 
|  | static char prefix[] = "CJK UNIFIED IDEOGRAPH-"; | 
|  | #   define PREFIX_LENGTH 23 | 
|  | #   define PREFIX_LENGTH_4 24 | 
|  | uint32_t countAlgRanges; | 
|  |  | 
|  | static AlgorithmicRange cjkExtA={ | 
|  | 0x3400, 0x4db5, | 
|  | 0, 4, | 
|  | sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 | 
|  | }; | 
|  | static AlgorithmicRange cjk={ | 
|  | 0x4e00, 0x9fa5, | 
|  | 0, 4, | 
|  | sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 | 
|  | }; | 
|  | static AlgorithmicRange cjkExtB={ | 
|  | 0x20000, 0x2a6d6, | 
|  | 0, 5, | 
|  | sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 | 
|  | }; | 
|  |  | 
|  | static char jamo[]= | 
|  | "HANGUL SYLLABLE \0" | 
|  |  | 
|  | "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0" | 
|  | "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0" | 
|  |  | 
|  | "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0" | 
|  | "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0" | 
|  | "YU\0EU\0YI\0I\0" | 
|  |  | 
|  | "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0" | 
|  | "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0" | 
|  | "S\0SS\0NG\0J\0C\0K\0T\0P\0H" | 
|  | ; | 
|  |  | 
|  | static AlgorithmicRange hangul={ | 
|  | 0xac00, 0xd7a3, | 
|  | 1, 3, | 
|  | sizeof(AlgorithmicRange)+6+sizeof(jamo) | 
|  | }; | 
|  |  | 
|  | /* modulo factors, maximum 8 */ | 
|  | /* 3 factors: 19, 21, 28, most-to-least-significant */ | 
|  | static uint16_t hangulFactors[3]={ | 
|  | 19, 21, 28 | 
|  | }; | 
|  |  | 
|  | uint32_t size; | 
|  |  | 
|  | size=0; | 
|  |  | 
|  | /* number of ranges of algorithmic names */ | 
|  | if(uprv_memcmp(dataInfo.dataVersion, unicode_3_1, sizeof(UVersionInfo))>=0) { | 
|  | /* Unicode 3.1 and up has 4 ranges including CJK Extension B */ | 
|  | countAlgRanges=4; | 
|  | } else if(uprv_memcmp(dataInfo.dataVersion, unicode_3_0, sizeof(UVersionInfo))>=0) { | 
|  | /* Unicode 3.0 has 3 ranges including CJK Extension A */ | 
|  | countAlgRanges=3; | 
|  | } else { | 
|  | /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */ | 
|  | countAlgRanges=2; | 
|  | } | 
|  |  | 
|  | if(pData!=NULL) { | 
|  | udata_write32(pData, countAlgRanges); | 
|  | } else { | 
|  | size+=4; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * each range: | 
|  | * uint32_t rangeStart | 
|  | * uint32_t rangeEnd | 
|  | * uint8_t algorithmType | 
|  | * uint8_t algorithmVariant | 
|  | * uint16_t size of range data | 
|  | * uint8_t[size] data | 
|  | */ | 
|  |  | 
|  | /* range 0: cjk extension a */ | 
|  | if(countAlgRanges>=3) { | 
|  | if(pData!=NULL) { | 
|  | udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange)); | 
|  | udata_writeString(pData, prefix, PREFIX_LENGTH); | 
|  | if(PREFIX_LENGTH<PREFIX_LENGTH_4) { | 
|  | udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); | 
|  | } | 
|  | } else { | 
|  | size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* range 1: cjk */ | 
|  | if(pData!=NULL) { | 
|  | udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange)); | 
|  | udata_writeString(pData, prefix, PREFIX_LENGTH); | 
|  | if(PREFIX_LENGTH<PREFIX_LENGTH_4) { | 
|  | udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); | 
|  | } | 
|  | } else { | 
|  | size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; | 
|  | } | 
|  |  | 
|  | /* range 2: hangul syllables */ | 
|  | if(pData!=NULL) { | 
|  | udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange)); | 
|  | udata_writeBlock(pData, hangulFactors, 6); | 
|  | udata_writeString(pData, jamo, sizeof(jamo)); | 
|  | } else { | 
|  | size+=sizeof(AlgorithmicRange)+6+sizeof(jamo); | 
|  | } | 
|  |  | 
|  | /* range 3: cjk extension b */ | 
|  | if(countAlgRanges>=4) { | 
|  | if(pData!=NULL) { | 
|  | udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange)); | 
|  | udata_writeString(pData, prefix, PREFIX_LENGTH); | 
|  | if(PREFIX_LENGTH<PREFIX_LENGTH_4) { | 
|  | udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); | 
|  | } | 
|  | } else { | 
|  | size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; | 
|  | } | 
|  | } | 
|  |  | 
|  | return size; | 
|  | } | 
|  |  | 
|  | /* helpers ------------------------------------------------------------------ */ | 
|  |  | 
|  | static int16_t | 
|  | findToken(uint8_t *s, int16_t length) { | 
|  | int16_t i, token; | 
|  |  | 
|  | for(i=0; i<(int16_t)tokenCount; ++i) { | 
|  | token=tokens[i]; | 
|  | if(token!=-1 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) { | 
|  | return i; | 
|  | } | 
|  | } | 
|  |  | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | static Word * | 
|  | findWord(char *s, int16_t length) { | 
|  | uint32_t i; | 
|  |  | 
|  | for(i=0; i<wordCount; ++i) { | 
|  | if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) { | 
|  | return words+i; | 
|  | } | 
|  | } | 
|  |  | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | static Word * | 
|  | addWord(char *s, int16_t length) { | 
|  | uint8_t *stringStart; | 
|  | Word *word; | 
|  |  | 
|  | if(wordCount==MAX_WORD_COUNT) { | 
|  | fprintf(stderr, "gennames: too many words\n"); | 
|  | exit(U_BUFFER_OVERFLOW_ERROR); | 
|  | } | 
|  |  | 
|  | stringStart=allocWord(length); | 
|  | uprv_memcpy(stringStart, s, length); | 
|  |  | 
|  | word=words+wordCount; | 
|  |  | 
|  | /* | 
|  | * Initialize the weight with the costs for this token: | 
|  | * a zero-terminated string and a 16-bit offset. | 
|  | */ | 
|  | word->weight=-(length+1+2); | 
|  | word->count=0; | 
|  | word->length=length; | 
|  | word->s=stringStart; | 
|  |  | 
|  | ++wordCount; | 
|  |  | 
|  | return word; | 
|  | } | 
|  |  | 
|  | static void | 
|  | countWord(Word *word) { | 
|  | /* add to the weight the savings: the length of the word minus 1 byte for the token */ | 
|  | word->weight+=word->length-1; | 
|  | ++word->count; | 
|  | } | 
|  |  | 
|  | static void | 
|  | addLine(uint32_t code, char *name1, int16_t name1Length, char *name2, int16_t name2Length) { | 
|  | uint8_t *stringStart; | 
|  | Line *line; | 
|  | int16_t length; | 
|  |  | 
|  | if(lineCount==MAX_LINE_COUNT) { | 
|  | fprintf(stderr, "gennames: too many lines\n"); | 
|  | exit(U_BUFFER_OVERFLOW_ERROR); | 
|  | } | 
|  |  | 
|  | length=name1Length; | 
|  | if(name2Length>0) { | 
|  | length=(int16_t)(length+1+name2Length); | 
|  | } | 
|  |  | 
|  | stringStart=allocLine(length); | 
|  | if(name1Length>0) { | 
|  | uprv_memcpy(stringStart, name1, name1Length); | 
|  | } | 
|  | if(name2Length>0) { | 
|  | stringStart[name1Length]=NAME_SEPARATOR_CHAR; | 
|  | uprv_memcpy(stringStart+name1Length+1, name2, name2Length); | 
|  | } | 
|  |  | 
|  | line=lines+lineCount; | 
|  |  | 
|  | line->code=code; | 
|  | line->length=length; | 
|  | line->s=stringStart; | 
|  |  | 
|  | ++lineCount; | 
|  |  | 
|  | /* prevent a character value that is actually in a name from becoming a token */ | 
|  | while(length>0) { | 
|  | tokens[stringStart[--length]]=-1; | 
|  | } | 
|  | } | 
|  |  | 
|  | static void | 
|  | addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) { | 
|  | uint8_t *stringStart; | 
|  | Line *line; | 
|  |  | 
|  | if(lineCount==MAX_LINE_COUNT) { | 
|  | fprintf(stderr, "gennames: too many groups\n"); | 
|  | exit(U_BUFFER_OVERFLOW_ERROR); | 
|  | } | 
|  |  | 
|  | /* store the line lengths first, then the strings */ | 
|  | lineLengthsTop=(lineLengthsTop+1)/2; | 
|  | stringStart=allocLine(lineLengthsTop+length); | 
|  | uprv_memcpy(stringStart, lineLengths, lineLengthsTop); | 
|  | uprv_memcpy(stringStart+lineLengthsTop, strings, length); | 
|  |  | 
|  | line=lines+lineCount; | 
|  |  | 
|  | line->code=groupMSB; | 
|  | line->length=length; | 
|  | line->s=stringStart; | 
|  |  | 
|  | ++lineCount; | 
|  | } | 
|  |  | 
|  | static uint32_t | 
|  | addToken(uint8_t *s, int16_t length) { | 
|  | uint8_t *stringStart; | 
|  |  | 
|  | stringStart=allocLine(length+1); | 
|  | uprv_memcpy(stringStart, s, length); | 
|  | stringStart[length]=0; | 
|  |  | 
|  | return stringStart-stringStore; | 
|  | } | 
|  |  | 
|  | static void | 
|  | appendLineLength(int16_t length) { | 
|  | if(length>=76) { | 
|  | fprintf(stderr, "gennames: compressed line too long\n"); | 
|  | exit(U_BUFFER_OVERFLOW_ERROR); | 
|  | } | 
|  | if(length>=12) { | 
|  | length-=12; | 
|  | appendLineLengthNibble((uint8_t)((length>>4)|12)); | 
|  | } | 
|  | appendLineLengthNibble((uint8_t)length); | 
|  | } | 
|  |  | 
|  | static void | 
|  | appendLineLengthNibble(uint8_t nibble) { | 
|  | if((lineLengthsTop&1)==0) { | 
|  | lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4); | 
|  | } else { | 
|  | lineLengths[lineLengthsTop/2]|=nibble&0xf; | 
|  | } | 
|  | ++lineLengthsTop; | 
|  | } | 
|  |  | 
|  | static uint8_t * | 
|  | allocLine(uint32_t length) { | 
|  | uint32_t top=lineTop+length; | 
|  | uint8_t *p; | 
|  |  | 
|  | if(top>wordBottom) { | 
|  | fprintf(stderr, "gennames: out of memory\n"); | 
|  | exit(U_MEMORY_ALLOCATION_ERROR); | 
|  | } | 
|  | p=stringStore+lineTop; | 
|  | lineTop=top; | 
|  | return p; | 
|  | } | 
|  |  | 
|  | static uint8_t * | 
|  | allocWord(uint32_t length) { | 
|  | uint32_t bottom=wordBottom-length; | 
|  |  | 
|  | if(lineTop>bottom) { | 
|  | fprintf(stderr, "gennames: out of memory\n"); | 
|  | exit(U_MEMORY_ALLOCATION_ERROR); | 
|  | } | 
|  | wordBottom=bottom; | 
|  | return stringStore+bottom; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Hey, Emacs, please set the following: | 
|  | * | 
|  | * Local Variables: | 
|  | * indent-tabs-mode: nil | 
|  | * End: | 
|  | * | 
|  | */ |