source/tools/chartype/chartbld.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
  *
  *   Copyright (C) 1996-2001, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  * @version	1.0 12/12/96
  * @author	Helena Shih
  * Based on Taligent international support for C++
  */

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>

 #ifndef _CHARTBLD
 #include "chartbld.h"
 #endif

 #if U_IOSTREAM_SOURCE >= 199711
 #include <iostream>
 using namespace std;
 #elif U_IOSTREAM_SOURCE >= 198506
 #include <iostream.h>
 #endif

 /*
  * Two-letter general-category tags, concatenated with no separators.
  * MakeProp() searches this string and halves the match offset, so the tag
  * occupying characters 2*i and 2*i+1 corresponds to tagValues[i].
  */
 const char CharTypeBuilder::tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
 /*
  * Values stored into the character-type table, listed in the same order as
  * the tags in tagStrings. The comment on each entry names the matching tag.
  */
 const int16_t CharTypeBuilder::tagValues[] =
 	{
 	/* Mn */ (int16_t)NON_SPACING_MARK,
 	/* Mc */ (int16_t)COMBINING_SPACING_MARK,
 	/* Me */ (int16_t)ENCLOSING_MARK,
 	/* Nd */ (int16_t)DECIMAL_DIGIT_NUMBER,
 	/* Nl */ (int16_t)LETTER_NUMBER,
 	/* No */ (int16_t)OTHER_NUMBER,
 	/* Zs */ (int16_t)SPACE_SEPARATOR,
 	/* Zl */ (int16_t)LINE_SEPARATOR,
 	/* Zp */ (int16_t)PARAGRAPH_SEPARATOR,
 	/* Cc */ (int16_t)CONTROL,
 	/* Cf */ (int16_t)FORMAT,
 	/* Cs */ (int16_t)SURROGATE,
 	/* Co */ (int16_t)PRIVATE_USE,
 	/* Cn */ (int16_t)UNASSIGNED,
 	/* Lu */ (int16_t)UPPERCASE_LETTER,
 	/* Ll */ (int16_t)LOWERCASE_LETTER,
 	/* Lt */ (int16_t)TITLECASE_LETTER,
 	/* Lm */ (int16_t)MODIFIER_LETTER,
 	/* Lo */ (int16_t)OTHER_LETTER,
 	/* Pc */ (int16_t)CONNECTOR_PUNCTUATION,
 	/* Pd */ (int16_t)DASH_PUNCTUATION,
 	/* Ps */ (int16_t)START_PUNCTUATION,
 	/* Pe */ (int16_t)END_PUNCTUATION,
 	/* Po */ (int16_t)OTHER_PUNCTUATION,
 	/* Sm */ (int16_t)MATH_SYMBOL,
 	/* Sc */ (int16_t)CURRENCY_SYMBOL,
 	/* Sk */ (int16_t)MODIFIER_SYMBOL,
 	/* So */ (int16_t)OTHER_SYMBOL,
 	/* Pi */ (int16_t)INITIAL_PUNCTUATION,
 	/* Pf */ (int16_t)FINAL_PUNCTUATION
 	};

 	/* getByteArray() stops reading the input once it has stored this code point. */
 	const UChar CharTypeBuilder:: LAST_CHAR_CODE_IN_FILE = 0xFFFD;

 /* Table built by getByteArray() the first time it is called; 0 until then. */
 CompactByteArray* CharTypeBuilder::charTypeArray = 0;
 /**
  * Looks up a general-category tag (for example "Lu") in tagStrings.
  *
  * Only tag-aligned (even) offsets are compared, so a string that merely
  * straddles two neighbouring tags (for example "nM") is not accepted.
  *
  * @param str NUL-terminated tag text; the caller passes exactly two characters.
  * @return the index of the tag, usable as a subscript into tagValues.
  *         Unrecognized, empty or null input is reported on stderr and
  *         yields 0, the same fallback value as before.
  */
 int
 CharTypeBuilder::MakeProp(char* str)
 {
 	const size_t tagLength = (str != 0) ? strlen(str) : 0;

 	if (tagLength > 0) {
 		// Step through tagStrings one two-character tag at a time.
 		for (const char* candidate = tagStrings;
 			 candidate[0] != 0 && candidate[1] != 0;
 			 candidate += 2) {
 			if (strncmp(candidate, str, tagLength) == 0)
 				return (int)((candidate - tagStrings) / 2);
 		}
 	}

 	fprintf(stderr, "unrecognized type letter %s\n", (str != 0) ? str : "(null)");
 	return 0;
 }

 /**
  * Builds the character-type table from a semicolon-separated data file on
  * the first call, and returns the cached table on every later call.
  *
  * Each data line supplies a hexadecimal code point in its first field and a
  * two-letter type tag in its third field. Lines starting with '#' and empty
  * lines are ignored. Reading stops at end of file or after the entry for
  * LAST_CHAR_CODE_IN_FILE.
  *
  * @param input open stream to read from; it is closed here once the table
  *              has been built. It is not touched when the table already exists.
  * @return the table, or 0 if input is null or the table could not be allocated.
  */
 CompactByteArray*
 CharTypeBuilder::getByteArray(FILE* input)
 {
 	if (charTypeArray == 0) {
 		// Check the stream before the first fgets(), not only before fclose().
 		if (input == 0) {
 			fprintf(stderr, "No unicode data file to read.\n");
 			return 0;
 		}

 		char	buffer[1000];
 		char*	bufferPtr;

 		try {
 			charTypeArray = ucmp8_open((int8_t)CharTypeBuilder::UNASSIGNED);
 			if (charTypeArray == 0) {
 				fprintf(stderr, "Cannot allocate the character type array.\n");
 				fclose(input);
 				return 0;
 			}
 			int32_t unicode;
 			while (TRUE) {
 				bufferPtr = fgets(buffer, (int)sizeof(buffer), input);
 				if (bufferPtr == NULL) break;
 				if (bufferPtr[0] == '#' || bufferPtr[0] == '\n' || bufferPtr[0] == 0) continue;

 				// %X requires an unsigned int target. A line with no leading hex
 				// number is skipped instead of reusing the previous code point.
 				unsigned int parsed = 0;
 				if (sscanf(bufferPtr, "%X", &parsed) != 1) {
 					fprintf(stderr, "skipping line without a code point: %s", bufferPtr);
 					continue;
 				}
 				assert(parsed < 65536);
 				unicode = (int32_t)parsed;

 				bufferPtr = strchr(bufferPtr, ';');
 				assert(bufferPtr != NULL);
 				bufferPtr = strchr(bufferPtr + 1, ';');	// go to start of third field
 				assert(bufferPtr != NULL);
 				bufferPtr++;

 				// The tag needs two characters; without this check the
 				// terminator below could be written past the end of the line.
 				if (bufferPtr[0] == 0 || bufferPtr[1] == 0) {
 					fprintf(stderr, "skipping line with a truncated type field for %X\n", parsed);
 					continue;
 				}
 				bufferPtr[2] = 0;	// cut the line right after the two-letter tag
 				ucmp8_set(charTypeArray, (UChar)unicode, (int8_t)tagValues[MakeProp(bufferPtr)]);
 				if (unicode == LAST_CHAR_CODE_IN_FILE)
 					break;
 			}
 			/* Check the database to see if this needs to be updated!!! */
 			// Each call copies the type stored for a range's first code point to
 			// the rest of that range (presumably because the data file lists
 			// only the range boundaries -- confirm against the data file).
 			ucmp8_setRange(charTypeArray, 0x3401, 0x4db4, ucmp8_get(charTypeArray, 0x3400));
 			ucmp8_setRange(charTypeArray, 0x4e01, 0x9fa4, ucmp8_get(charTypeArray, 0x4e00));
 			ucmp8_setRange(charTypeArray, 0xac01, 0xd7a2, ucmp8_get(charTypeArray, 0xac00));
 			ucmp8_setRange(charTypeArray, 0xd801, 0xdb7e, ucmp8_get(charTypeArray, 0xd800));
 			ucmp8_setRange(charTypeArray, 0xdb81, 0xdbfe, ucmp8_get(charTypeArray, 0xdb80));
 			ucmp8_setRange(charTypeArray, 0xdc01, 0xdffe, ucmp8_get(charTypeArray, 0xdc00));
 			ucmp8_setRange(charTypeArray, 0xe001, 0xf8fe, ucmp8_get(charTypeArray, 0xe000));

 			fclose(input);
 			ucmp8_compact(charTypeArray, 1);
 		}
 		catch (...) {
 			fprintf(stderr, "Error Occured while parsing unicode data file.\n");
 		}
 	}
 	return charTypeArray;
 }

 /**
  * Writes the compacted table to standard output as C++ source: the
  * definitions of Unicode::indicies[], Unicode::values[] and
  * Unicode::offsetCount.
  *
  * getByteArray() must have built the table first. If it has not, a message
  * is written to stderr and nothing is emitted.
  */
 void
 CharTypeBuilder::writeByteArrays()
 {
 	if (charTypeArray == 0) {
 		fprintf(stderr, "No character type array to write.\n");
 		return;
 	}

 	const int8_t* values = ucmp8_getArray(charTypeArray);
 	const uint16_t* indexes = ucmp8_getIndex(charTypeArray);
 	const int32_t cnt = ucmp8_getCount(charTypeArray);
 	const int32_t lastIndex = ucmp8_getkIndexCount() - 1;
 	int32_t i;

 	// indexes[] is unsigned, so the entries are printed as they are. The
 	// former "negative value" adjustment could never be taken.
 	cout << "\nconst unsigned short Unicode::indicies[] = {\n    ";
 	for (i = 0; i < lastIndex; i++)
 	{
 		cout << "(uint16_t)" << (int)indexes[i] << ", ";
 		if (i != 0 && i % 3 == 0)
 			cout << "\n    ";
 	}
 	cout << "    (uint16_t)" << (int)indexes[lastIndex] << " };\n";

 	cout << "\nconst char Unicode::values[] = {\n    ";
 	for (i = 0; i < cnt-1; i++)
 	{
 		cout << "(int8_t)" << (int)values[i] << ", ";
 		if (i != 0 && i % 5 == 0)
 			cout << "\n    ";
 	}
 	// Do not read values[-1] when the table is empty.
 	if (cnt > 0)
 		cout << "    (int8_t)" << (int)values[cnt-1];
 	// The terminating ';' was missing, which left the generated source uncompilable.
 	cout << " };\n";
 	cout << "const short Unicode::offsetCount = " << cnt << ";\n";
 }
 /**
  * Entry point. Builds the character-type table from the data file named on
  * the command line and writes the generated array definitions to standard
  * output. Diagnostics go to stderr so they cannot mix with the generated code.
  *
  * @return 0 on success; the process exits with status 1 on a usage error,
  *         an unreadable input file, or a table that could not be built.
  */
 int main(int argc, char** argv)
 {
     if (argc != 2) {
         fprintf(stderr, "Usage : chartype filename\n\n");
         exit(1);
     }
     FILE *input = fopen(argv[1], "r");
     if (input == 0) {
         fprintf(stderr, "Cannot open the input file: %s\n\n", argv[1]);
         exit(1);
     }
     CompactByteArray* arrays = CharTypeBuilder::getByteArray(input);
     if (arrays == 0) {
         fprintf(stderr, "Cannot build the character type table from: %s\n\n", argv[1]);
         exit(1);
     }
     CharTypeBuilder::writeByteArrays();
     return 0;
 }
	/*
	*
	* Copyright (C) 1996-2001, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	* @version 1.0 12/12/96
	* @author Helena Shih
	* Based on Taligent international support for C++
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <assert.h>

	#ifndef _CHARTBLD
	#include "chartbld.h"
	#endif

	#if U_IOSTREAM_SOURCE >= 199711
	#include <iostream>
	using namespace std;
	#elif U_IOSTREAM_SOURCE >= 198506
	#include <iostream.h>
	#endif

	/*
	 * Two-letter general-category tags, concatenated with no separators.
	 * MakeProp() searches this string and halves the match offset, so the tag
	 * occupying characters 2*i and 2*i+1 corresponds to tagValues[i].
	 */
	const char CharTypeBuilder::tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
	/*
	 * Values stored into the character-type table, listed in the same order as
	 * the tags in tagStrings. The comment on each entry names the matching tag.
	 */
	const int16_t CharTypeBuilder::tagValues[] =
	{
	/* Mn */ (int16_t)NON_SPACING_MARK,
	/* Mc */ (int16_t)COMBINING_SPACING_MARK,
	/* Me */ (int16_t)ENCLOSING_MARK,
	/* Nd */ (int16_t)DECIMAL_DIGIT_NUMBER,
	/* Nl */ (int16_t)LETTER_NUMBER,
	/* No */ (int16_t)OTHER_NUMBER,
	/* Zs */ (int16_t)SPACE_SEPARATOR,
	/* Zl */ (int16_t)LINE_SEPARATOR,
	/* Zp */ (int16_t)PARAGRAPH_SEPARATOR,
	/* Cc */ (int16_t)CONTROL,
	/* Cf */ (int16_t)FORMAT,
	/* Cs */ (int16_t)SURROGATE,
	/* Co */ (int16_t)PRIVATE_USE,
	/* Cn */ (int16_t)UNASSIGNED,
	/* Lu */ (int16_t)UPPERCASE_LETTER,
	/* Ll */ (int16_t)LOWERCASE_LETTER,
	/* Lt */ (int16_t)TITLECASE_LETTER,
	/* Lm */ (int16_t)MODIFIER_LETTER,
	/* Lo */ (int16_t)OTHER_LETTER,
	/* Pc */ (int16_t)CONNECTOR_PUNCTUATION,
	/* Pd */ (int16_t)DASH_PUNCTUATION,
	/* Ps */ (int16_t)START_PUNCTUATION,
	/* Pe */ (int16_t)END_PUNCTUATION,
	/* Po */ (int16_t)OTHER_PUNCTUATION,
	/* Sm */ (int16_t)MATH_SYMBOL,
	/* Sc */ (int16_t)CURRENCY_SYMBOL,
	/* Sk */ (int16_t)MODIFIER_SYMBOL,
	/* So */ (int16_t)OTHER_SYMBOL,
	/* Pi */ (int16_t)INITIAL_PUNCTUATION,
	/* Pf */ (int16_t)FINAL_PUNCTUATION
	};

	/* getByteArray() stops reading the input once it has stored this code point. */
	const UChar CharTypeBuilder:: LAST_CHAR_CODE_IN_FILE = 0xFFFD;

	/* Table built by getByteArray() the first time it is called; 0 until then. */
	CompactByteArray* CharTypeBuilder::charTypeArray = 0;
	/**
	 * Looks up a general-category tag (for example "Lu") in tagStrings.
	 *
	 * Only tag-aligned (even) offsets are compared, so a string that merely
	 * straddles two neighbouring tags (for example "nM") is not accepted.
	 *
	 * @param str NUL-terminated tag text; the caller passes exactly two characters.
	 * @return the index of the tag, usable as a subscript into tagValues.
	 *         Unrecognized, empty or null input is reported on stderr and
	 *         yields 0, the same fallback value as before.
	 */
	int
	CharTypeBuilder::MakeProp(char* str)
	{
		const size_t tagLength = (str != 0) ? strlen(str) : 0;

		if (tagLength > 0) {
			// Step through tagStrings one two-character tag at a time.
			for (const char* candidate = tagStrings;
				 candidate[0] != 0 && candidate[1] != 0;
				 candidate += 2) {
				if (strncmp(candidate, str, tagLength) == 0)
					return (int)((candidate - tagStrings) / 2);
			}
		}

		fprintf(stderr, "unrecognized type letter %s\n", (str != 0) ? str : "(null)");
		return 0;
	}

	/**
	 * Builds the character-type table from a semicolon-separated data file on
	 * the first call, and returns the cached table on every later call.
	 *
	 * Each data line supplies a hexadecimal code point in its first field and a
	 * two-letter type tag in its third field. Lines starting with '#' and empty
	 * lines are ignored. Reading stops at end of file or after the entry for
	 * LAST_CHAR_CODE_IN_FILE.
	 *
	 * @param input open stream to read from; it is closed here once the table
	 *              has been built. It is not touched when the table already exists.
	 * @return the table, or 0 if input is null or the table could not be allocated.
	 */
	CompactByteArray*
	CharTypeBuilder::getByteArray(FILE* input)
	{
		if (charTypeArray == 0) {
			// Check the stream before the first fgets(), not only before fclose().
			if (input == 0) {
				fprintf(stderr, "No unicode data file to read.\n");
				return 0;
			}

			char buffer[1000];
			char* bufferPtr;

			try {
				charTypeArray = ucmp8_open((int8_t)CharTypeBuilder::UNASSIGNED);
				if (charTypeArray == 0) {
					fprintf(stderr, "Cannot allocate the character type array.\n");
					fclose(input);
					return 0;
				}
				int32_t unicode;
				while (TRUE) {
					bufferPtr = fgets(buffer, (int)sizeof(buffer), input);
					if (bufferPtr == NULL) break;
					if (bufferPtr[0] == '#' || bufferPtr[0] == '\n' || bufferPtr[0] == 0) continue;

					// %X requires an unsigned int target. A line with no leading hex
					// number is skipped instead of reusing the previous code point.
					unsigned int parsed = 0;
					if (sscanf(bufferPtr, "%X", &parsed) != 1) {
						fprintf(stderr, "skipping line without a code point: %s", bufferPtr);
						continue;
					}
					assert(parsed < 65536);
					unicode = (int32_t)parsed;

					bufferPtr = strchr(bufferPtr, ';');
					assert(bufferPtr != NULL);
					bufferPtr = strchr(bufferPtr + 1, ';'); // go to start of third field
					assert(bufferPtr != NULL);
					bufferPtr++;

					// The tag needs two characters; without this check the
					// terminator below could be written past the end of the line.
					if (bufferPtr[0] == 0 || bufferPtr[1] == 0) {
						fprintf(stderr, "skipping line with a truncated type field for %X\n", parsed);
						continue;
					}
					bufferPtr[2] = 0; // cut the line right after the two-letter tag
					ucmp8_set(charTypeArray, (UChar)unicode, (int8_t)tagValues[MakeProp(bufferPtr)]);
					if (unicode == LAST_CHAR_CODE_IN_FILE)
						break;
				}
				/* Check the database to see if this needs to be updated!!! */
				// Each call copies the type stored for a range's first code point to
				// the rest of that range (presumably because the data file lists
				// only the range boundaries -- confirm against the data file).
				ucmp8_setRange(charTypeArray, 0x3401, 0x4db4, ucmp8_get(charTypeArray, 0x3400));
				ucmp8_setRange(charTypeArray, 0x4e01, 0x9fa4, ucmp8_get(charTypeArray, 0x4e00));
				ucmp8_setRange(charTypeArray, 0xac01, 0xd7a2, ucmp8_get(charTypeArray, 0xac00));
				ucmp8_setRange(charTypeArray, 0xd801, 0xdb7e, ucmp8_get(charTypeArray, 0xd800));
				ucmp8_setRange(charTypeArray, 0xdb81, 0xdbfe, ucmp8_get(charTypeArray, 0xdb80));
				ucmp8_setRange(charTypeArray, 0xdc01, 0xdffe, ucmp8_get(charTypeArray, 0xdc00));
				ucmp8_setRange(charTypeArray, 0xe001, 0xf8fe, ucmp8_get(charTypeArray, 0xe000));

				fclose(input);
				ucmp8_compact(charTypeArray, 1);
			}
			catch (...) {
				fprintf(stderr, "Error Occured while parsing unicode data file.\n");
			}
		}
		return charTypeArray;
	}

	/**
	 * Writes the compacted table to standard output as C++ source: the
	 * definitions of Unicode::indicies[], Unicode::values[] and
	 * Unicode::offsetCount.
	 *
	 * getByteArray() must have built the table first. If it has not, a message
	 * is written to stderr and nothing is emitted.
	 */
	void
	CharTypeBuilder::writeByteArrays()
	{
		if (charTypeArray == 0) {
			fprintf(stderr, "No character type array to write.\n");
			return;
		}

		const int8_t* values = ucmp8_getArray(charTypeArray);
		const uint16_t* indexes = ucmp8_getIndex(charTypeArray);
		const int32_t cnt = ucmp8_getCount(charTypeArray);
		const int32_t lastIndex = ucmp8_getkIndexCount() - 1;
		int32_t i;

		// indexes[] is unsigned, so the entries are printed as they are. The
		// former "negative value" adjustment could never be taken.
		cout << "\nconst unsigned short Unicode::indicies[] = {\n ";
		for (i = 0; i < lastIndex; i++)
		{
			cout << "(uint16_t)" << (int)indexes[i] << ", ";
			if (i != 0 && i % 3 == 0)
				cout << "\n ";
		}
		cout << " (uint16_t)" << (int)indexes[lastIndex] << " };\n";

		cout << "\nconst char Unicode::values[] = {\n ";
		for (i = 0; i < cnt-1; i++)
		{
			cout << "(int8_t)" << (int)values[i] << ", ";
			if (i != 0 && i % 5 == 0)
				cout << "\n ";
		}
		// Do not read values[-1] when the table is empty.
		if (cnt > 0)
			cout << " (int8_t)" << (int)values[cnt-1];
		// The terminating ';' was missing, which left the generated source uncompilable.
		cout << " };\n";
		cout << "const short Unicode::offsetCount = " << cnt << ";\n";
	}
	/**
	 * Entry point. Builds the character-type table from the data file named on
	 * the command line and writes the generated array definitions to standard
	 * output. Diagnostics go to stderr so they cannot mix with the generated code.
	 *
	 * @return 0 on success; the process exits with status 1 on a usage error,
	 *         an unreadable input file, or a table that could not be built.
	 */
	int main(int argc, char** argv)
	{
		if (argc != 2) {
			fprintf(stderr, "Usage : chartype filename\n\n");
			exit(1);
		}
		FILE *input = fopen(argv[1], "r");
		if (input == 0) {
			fprintf(stderr, "Cannot open the input file: %s\n\n", argv[1]);
			exit(1);
		}
		CompactByteArray* arrays = CharTypeBuilder::getByteArray(input);
		if (arrays == 0) {
			fprintf(stderr, "Cannot build the character type table from: %s\n\n", argv[1]);
			exit(1);
		}
		CharTypeBuilder::writeByteArrays();
		return 0;
	}