blob: 13b4a320048d7c63ceb484ff2d75ae1b51b13b97 [file] [log] [blame]
/*
*
* Copyright (C) 1996-2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
* @version 1.0 12/12/96
* @author Helena Shih
* Based on Taligent international support for C++
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#ifndef _CHARTBLD
#include "chartbld.h"
#endif
#if U_IOSTREAM_SOURCE >= 199711
#include <iostream>
using namespace std;
#elif U_IOSTREAM_SOURCE >= 198506
#include <iostream.h>
#endif
// Two-letter general-category abbreviations concatenated without separators.
// MakeProp() converts a tag's position in this string into an index, so the
// N-th pair here must correspond to the N-th entry of tagValues below.
const char CharTypeBuilder::tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
// Category constants listed in the same order as the pairs in tagStrings;
// getByteArray() indexes this table with the value returned by MakeProp().
const int16_t CharTypeBuilder::tagValues[] =
{
/* Mn */ (int16_t)NON_SPACING_MARK,
/* Mc */ (int16_t)COMBINING_SPACING_MARK,
/* Me */ (int16_t)ENCLOSING_MARK,
/* Nd */ (int16_t)DECIMAL_DIGIT_NUMBER,
/* Nl */ (int16_t)LETTER_NUMBER,
/* No */ (int16_t)OTHER_NUMBER,
/* Zs */ (int16_t)SPACE_SEPARATOR,
/* Zl */ (int16_t)LINE_SEPARATOR,
/* Zp */ (int16_t)PARAGRAPH_SEPARATOR,
/* Cc */ (int16_t)CONTROL,
/* Cf */ (int16_t)FORMAT,
/* Cs */ (int16_t)SURROGATE,
/* Co */ (int16_t)PRIVATE_USE,
/* Cn */ (int16_t)UNASSIGNED,
/* Lu */ (int16_t)UPPERCASE_LETTER,
/* Ll */ (int16_t)LOWERCASE_LETTER,
/* Lt */ (int16_t)TITLECASE_LETTER,
/* Lm */ (int16_t)MODIFIER_LETTER,
/* Lo */ (int16_t)OTHER_LETTER,
/* Pc */ (int16_t)CONNECTOR_PUNCTUATION,
/* Pd */ (int16_t)DASH_PUNCTUATION,
/* Ps */ (int16_t)START_PUNCTUATION,
/* Pe */ (int16_t)END_PUNCTUATION,
/* Po */ (int16_t)OTHER_PUNCTUATION,
/* Sm */ (int16_t)MATH_SYMBOL,
/* Sc */ (int16_t)CURRENCY_SYMBOL,
/* Sk */ (int16_t)MODIFIER_SYMBOL,
/* So */ (int16_t)OTHER_SYMBOL,
/* Pi */ (int16_t)INITIAL_PUNCTUATION,
/* Pf */ (int16_t)FINAL_PUNCTUATION
};
// Code point at which getByteArray() stops reading records from the data file.
const UChar CharTypeBuilder:: LAST_CHAR_CODE_IN_FILE = 0xFFFD;
// Table built by getByteArray() on its first call; stays 0 until then.
CompactByteArray* CharTypeBuilder::charTypeArray = 0;
/**
 * Maps a two-letter general-category abbreviation (for example "Lu") to the
 * position of that tag inside tagStrings, which is also its index in
 * tagValues.
 *
 * Only whole tags are compared, so an input that merely straddles two
 * neighbouring tags (such as "nM"), a single letter, or an empty string is
 * reported as unrecognized instead of being mapped to some category.
 *
 * @param str NUL-terminated abbreviation to look up; it is not modified.
 * @return the zero-based tag index, or 0 after printing a diagnostic to
 *         stderr when str is not one of the known two-letter tags.
 */
int
CharTypeBuilder::MakeProp(char* str)
{
    const bool isTwoLetters =
        str != 0 && str[0] != 0 && str[1] != 0 && str[2] == 0;
    if (isTwoLetters) {
        // Walk tagStrings one complete tag (two characters) at a time.
        for (const char* tag = tagStrings; tag[0] != 0 && tag[1] != 0; tag += 2) {
            if (tag[0] == str[0] && tag[1] == str[1]) {
                return (int)((tag - tagStrings) / 2);
            }
        }
    }
    fprintf(stderr, "unrecognized type letter %s\n", (str != 0) ? str : "(null)");
    return 0;
}
/**
 * Builds the character-type table on the first call and returns the cached
 * table on every later call.
 *
 * Each non-comment line of input is expected to start with a hexadecimal
 * code point and to carry the two-letter category tag in its third
 * semicolon-separated field. Lines that do not fit that shape are reported
 * on stderr and skipped. Reading stops at end of file or after the record
 * for LAST_CHAR_CODE_IN_FILE.
 *
 * @param input open data file; on the call that builds the table it is
 *              closed before returning. It is left untouched when the table
 *              already exists.
 * @return the table, or 0 when input is null or the table could not be
 *         created.
 */
CompactByteArray*
CharTypeBuilder::getByteArray(FILE* input)
{
    if (charTypeArray != 0) {
        return charTypeArray;
    }
    if (input == 0) {
        fprintf(stderr, "No unicode data file to parse.\n");
        return 0;
    }

    char buffer[1000];
    try {
        charTypeArray = ucmp8_open((int8_t)CharTypeBuilder::UNASSIGNED);
        if (charTypeArray == 0) {
            fprintf(stderr, "Cannot allocate the character type table.\n");
            fclose(input);
            return 0;
        }

        while (fgets(buffer, sizeof(buffer), input) != NULL) {
            char* field = buffer;
            if (field[0] == '#' || field[0] == '\n' || field[0] == 0) continue;

            // %X stores into an unsigned int, so parse into one and convert.
            unsigned int parsed = 0;
            if (sscanf(field, "%X", &parsed) != 1 || parsed >= 65536) {
                fprintf(stderr, "Skipping line with a missing or out-of-range code point.\n");
                continue;
            }
            const int32_t unicode = (int32_t)parsed;

            // Advance to the separator that precedes the third field.
            field = strchr(field, ';');
            if (field != NULL) field = strchr(field + 1, ';');
            if (field == NULL) {
                fprintf(stderr, "Skipping line with fewer than three fields.\n");
                continue;
            }
            field++;

            // Cut the field down to its two-letter tag, but only when the
            // third character really lies inside the string that was read.
            if (field[0] != 0 && field[1] != 0) field[2] = 0;

            ucmp8_set(charTypeArray, (UChar)unicode, (int8_t)tagValues[MakeProp(field)]);
            if (unicode == LAST_CHAR_CODE_IN_FILE)
                break;
        }

        /* Check the database to see if this needs to be updated!!! */
        // Each range below receives the value read back from the code point
        // just before it -- presumably because the data file lists only the
        // first and last code point of these blocks (confirm against the
        // data file in use).
        ucmp8_setRange(charTypeArray, 0x3401, 0x4db4, ucmp8_get(charTypeArray, 0x3400));
        ucmp8_setRange(charTypeArray, 0x4e01, 0x9fa4, ucmp8_get(charTypeArray, 0x4e00));
        ucmp8_setRange(charTypeArray, 0xac01, 0xd7a2, ucmp8_get(charTypeArray, 0xac00));
        ucmp8_setRange(charTypeArray, 0xd801, 0xdb7e, ucmp8_get(charTypeArray, 0xd800));
        ucmp8_setRange(charTypeArray, 0xdb81, 0xdbfe, ucmp8_get(charTypeArray, 0xdb80));
        ucmp8_setRange(charTypeArray, 0xdc01, 0xdffe, ucmp8_get(charTypeArray, 0xdc00));
        ucmp8_setRange(charTypeArray, 0xe001, 0xf8fe, ucmp8_get(charTypeArray, 0xe000));

        fclose(input);
        ucmp8_compact(charTypeArray, 1);
    }
    catch (...) {
        fprintf(stderr, "Error Occured while parsing unicode data file.\n");
    }
    return charTypeArray;
}
/**
 * Prints the table held in charTypeArray to standard output as C++ source:
 * the definition of Unicode::indicies, the definition of Unicode::values and
 * the definition of Unicode::offsetCount.
 *
 * Nothing is printed, apart from a diagnostic on stderr, when the table has
 * not been built or holds no values.
 */
void
CharTypeBuilder::writeByteArrays()
{
    if (charTypeArray == 0) {
        fprintf(stderr, "No character type table to write.\n");
        return;
    }

    const int8_t* values = ucmp8_getArray(charTypeArray);
    const uint16_t* indexes = ucmp8_getIndex(charTypeArray);
    const int32_t cnt = ucmp8_getCount(charTypeArray);
    if (values == 0 || indexes == 0 || cnt <= 0) {
        fprintf(stderr, "The character type table is empty.\n");
        return;
    }

    const int32_t lastIndex = (int32_t)(ucmp8_getkIndexCount() - 1);
    int32_t i;

    // The index entries are unsigned 16-bit values, so they are printed
    // directly; no sign adjustment can ever apply to them.
    cout << "\nconst unsigned short Unicode::indicies[] = {\n ";
    for (i = 0; i < lastIndex; i++)
    {
        cout << "(uint16_t)" << (int)indexes[i] << ", ";
        if (i != 0 && i % 3 == 0)
            cout << "\n ";
    }
    cout << " (uint16_t)" << (int)indexes[lastIndex] << " };\n";

    cout << "\nconst char Unicode::values[] = {\n ";
    for (i = 0; i < cnt-1; i++)
    {
        cout << "(int8_t)" << (int)values[i] << ", ";
        if (i != 0 && i % 5 == 0)
            cout << "\n ";
    }
    // Terminate the definition with a semicolon so the emitted code compiles.
    cout << " (int8_t)" << (int)values[cnt-1] << " };\n";
    cout << "const short Unicode::offsetCount = " << cnt << ";\n";
}
/**
 * Builds the CharType data array from the data file named on the command
 * line and prints it to standard output.
 *
 * Diagnostics go to stderr so that they never mix with the generated source
 * when standard output is redirected.
 *
 * @return 0 on success; 1 when the argument count is wrong, the file cannot
 *         be opened, or no table could be built from it.
 */
int main(int argc, char** argv)
{
    if (argc != 2) {
        fprintf(stderr, "Usage : chartype filename\n\n");
        return 1;
    }

    FILE *input = fopen(argv[1], "r");
    if (input == 0) {
        fprintf(stderr, "Cannot open the input file: %s\n\n", argv[1]);
        return 1;
    }

    // getByteArray closes the file once it has consumed it.
    if (CharTypeBuilder::getByteArray(input) == 0) {
        fprintf(stderr, "Cannot build the character type table from: %s\n\n", argv[1]);
        return 1;
    }

    CharTypeBuilder::writeByteArrays();
    return 0;
}