source/tools/ulxfrm/ulxfrm.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
  * (C) Copyright IBM Corp. 1998 - All Rights Reserved
  * @version	1.0 06/19/98
  * @author	Helena Shih
  * Based on Taligent international support for C++
  */

 #include <stdio.h>
 #include <stdlib.h>
 #include <iostream.h>
 #include <string.h>
 #include <assert.h>

 #include "ucmp16.h"
 /* Table shared by the functions in this file: getArray() creates and fills it
    when it is still null, and writeArrays() prints its contents. */
 CompactShortArray *ulxfrmArray = NULL;

 	/*
 	 * Numeric codes for the character categories handled by this tool.
 	 * Every value is spelled out explicitly; they run from 0 through 29.
 	 */
 	enum ECharTypeMapping {
 		/* unassigned */
 		UNASSIGNED             = 0,

 		/* letters */
 		UPPERCASE_LETTER       = 1,
 		LOWERCASE_LETTER       = 2,
 		TITLECASE_LETTER       = 3,
 		MODIFIER_LETTER        = 4,
 		OTHER_LETTER           = 5,

 		/* marks */
 		NON_SPACING_MARK       = 6,
 		ENCLOSING_MARK         = 7,
 		COMBINING_SPACING_MARK = 8,

 		/* numbers */
 		DECIMAL_DIGIT_NUMBER   = 9,
 		LETTER_NUMBER          = 10,
 		OTHER_NUMBER           = 11,

 		/* separators */
 		SPACE_SEPARATOR        = 12,
 		LINE_SEPARATOR         = 13,
 		PARAGRAPH_SEPARATOR    = 14,

 		/* control, format, private use, surrogate */
 		CONTROL                = 15,
 		FORMAT                 = 16,
 		PRIVATE_USE            = 17,
 		SURROGATE              = 18,

 		/* punctuation */
 		DASH_PUNCTUATION       = 19,
 		START_PUNCTUATION      = 20,
 		END_PUNCTUATION        = 21,
 		CONNECTOR_PUNCTUATION  = 22,
 		OTHER_PUNCTUATION      = 23,

 		/* symbols */
 		MATH_SYMBOL            = 24,
 		CURRENCY_SYMBOL        = 25,
 		MODIFIER_SYMBOL        = 26,
 		OTHER_SYMBOL           = 27,

 		/* initial / final punctuation */
 		INITIAL_PUNCTUATION    = 28,
 		FINAL_PUNCTUATION      = 29
 	};

 /* NOTE(review): judging by its name this is the last code point expected in
    the data file; nothing in the visible code reads it - confirm before use. */
 static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;

 /*
  * Two-letter category tags packed back to back: tag k occupies characters
  * 2k and 2k+1 of this string, and tagValues[k] holds the ECharTypeMapping
  * code for that tag.  The adjacent literals below concatenate into a single
  * string.
  */
 const char tagStrings[] =
 	"MnMcMe"
 	"NdNlNo"
 	"ZsZlZp"
 	"CcCfCsCoCn"
 	"LuLlLtLmLo"
 	"PcPdPsPePo"
 	"SmScSkSo"
 	"PiPf";

 /* Category codes, in the same order as the tags in tagStrings. */
 const int16_t tagValues[] =
 	{
 	NON_SPACING_MARK,          /* Mn */
 	COMBINING_SPACING_MARK,    /* Mc */
 	ENCLOSING_MARK,            /* Me */
 	DECIMAL_DIGIT_NUMBER,      /* Nd */
 	LETTER_NUMBER,             /* Nl */
 	OTHER_NUMBER,              /* No */
 	SPACE_SEPARATOR,           /* Zs */
 	LINE_SEPARATOR,            /* Zl */
 	PARAGRAPH_SEPARATOR,       /* Zp */
 	CONTROL,                   /* Cc */
 	FORMAT,                    /* Cf */
 	SURROGATE,                 /* Cs */
 	PRIVATE_USE,               /* Co */
 	UNASSIGNED,                /* Cn */
 	UPPERCASE_LETTER,          /* Lu */
 	LOWERCASE_LETTER,          /* Ll */
 	TITLECASE_LETTER,          /* Lt */
 	MODIFIER_LETTER,           /* Lm */
 	OTHER_LETTER,              /* Lo */
 	CONNECTOR_PUNCTUATION,     /* Pc */
 	DASH_PUNCTUATION,          /* Pd */
 	START_PUNCTUATION,         /* Ps */
 	END_PUNCTUATION,           /* Pe */
 	OTHER_PUNCTUATION,         /* Po */
 	MATH_SYMBOL,               /* Sm */
 	CURRENCY_SYMBOL,           /* Sc */
 	MODIFIER_SYMBOL,           /* Sk */
 	OTHER_SYMBOL,              /* So */
 	INITIAL_PUNCTUATION,       /* Pi */
 	FINAL_PUNCTUATION          /* Pf */
 	};
 /**
  * Looks up a two-letter category tag (for example "Lu") in tagStrings.
  *
  * Only whole tags are matched: the table is walked two characters at a time,
  * so text that merely straddles two neighbouring tags (such as "nM") is
  * rejected instead of being reported as the earlier tag.
  *
  * @param str  NUL-terminated tag text; must be exactly two characters long
  *             to match.
  * @return     Zero-based position of the tag, suitable as an index into
  *             tagValues.  When the tag is not recognized a diagnostic is
  *             written to stderr and 0 is returned; 0 is also the position of
  *             the first tag, so callers cannot tell the two cases apart.
  */
 int
 MakeProp(char* str)
 {
 	const size_t tableLength = sizeof(tagStrings) - 1;	/* exclude the NUL */
 	size_t offset;

 	if (str != 0 && strlen(str) == 2) {
 		/* Tags start on even offsets only. */
 		for (offset = 0; offset + 1 < tableLength; offset += 2) {
 			if (tagStrings[offset] == str[0] && tagStrings[offset + 1] == str[1])
 				return (int)(offset / 2);
 		}
 	}

 	fprintf(stderr, "unrecognized type letter %s\n", (str != 0) ? str : "(null)");
 	return 0;
 }

 /**
  * Reads one hexadecimal number from the start of text.
  *
  * @param text      Text to scan with sscanf's "%X" conversion.
  * @param fallback  Value handed back when no number can be read.
  * @return          The parsed value, or fallback when the conversion fails.
  */
 static int32_t
 scanHexField(const char* text, int32_t fallback)
 {
 	/* "%X" stores through an unsigned int*, so scan into one and convert. */
 	unsigned int parsed = 0;

 	if (sscanf(text, "%X", &parsed) != 1) return fallback;
 	return (int32_t)parsed;
 }

 /**
  * Builds the global ulxfrmArray from a ';'-delimited data file, unless it
  * has already been built, and returns it.
  *
  * For every line that is not blank and does not start with '#':
  *  - the leading hexadecimal number is taken as the code point;
  *  - the third field supplies the two-letter category tag;
  *  - a later field is read as a digit value, and for DECIMAL_DIGIT_NUMBER /
  *    OTHER_NUMBER entries with a value of 0..9 a "{ code, digit }" line is
  *    printed to cout;
  *  - a still later field is read as the mapped code point, which is stored
  *    in the array for LOWERCASE_LETTER / UPPERCASE_LETTER entries that do
  *    not have a value yet.
  *
  * NOTE(review): each field scan is preceded by a one-character step.  When
  * the field being left is empty, that step consumes the field's own ';', so
  * the following skip loop lands one field further on.  The mapping lookups
  * appear to rely on this - confirm against the data file layout before
  * changing the stepping.
  *
  * @param input  Open data file.  It is closed by this function whenever the
  *               array is built here; it is left untouched when the array
  *               already exists.  A null input builds nothing.
  * @return       The global ulxfrmArray (null if nothing could be built).
  */
 CompactShortArray*
 getArray(FILE *input)
 {
 	if (ulxfrmArray == 0 && input != 0) {
 		char	buffer[1000];
 		char*	bufferPtr;
 		int	set = FALSE;
 		char	type[3];

 		try {
 			ulxfrmArray = ucmp16_open((int16_t)0xffff);
 			int32_t unicode, otherunicode, digit, i;
 			while (TRUE) {
 				unicode = -1;			// a failed scan must trip the range assert
 				otherunicode = 0xffff;
 				digit = -1;
 				bufferPtr = fgets(buffer, sizeof(buffer), input);
 				if (bufferPtr == NULL) break;
 				if (bufferPtr[0] == '#' || bufferPtr[0] == '\n' || bufferPtr[0] == 0) continue;
 				unicode = scanHexField(bufferPtr, unicode);
 				assert(0 <= unicode && unicode < 65536);
 				bufferPtr = strchr(bufferPtr, ';');
 				assert(bufferPtr != NULL);
 				bufferPtr = strchr(bufferPtr + 1, ';');
 				assert(bufferPtr != NULL);
 				strncpy(type, ++bufferPtr, 2);	// go to start of third field
 				type[2] = 0;
 				int typeResult = tagValues[MakeProp(type)];

 				// check for the decimal values
 				if (*bufferPtr != 0) bufferPtr++;	// never step over the terminator
 				for (i = 3; i < 8; i++) {
 					bufferPtr = strchr(bufferPtr, ';');
 					assert(bufferPtr != NULL);
 					bufferPtr++;
 				}
 				digit = scanHexField(bufferPtr, digit);
 				if (((typeResult == DECIMAL_DIGIT_NUMBER) || (typeResult == OTHER_NUMBER)) &&
 				    (digit >= 0 && digit <= 9)) {
 					// Format into a separate buffer: the line buffer is still being parsed.
 					char codeText[16];
 					sprintf(codeText, "0x%04X", (unsigned int)unicode);
 					cout << "    { " << codeText << ", " << digit << "}, \n";
 				}

 				if (*bufferPtr != 0) bufferPtr++;
 				for (i = 8; i < 12; i++) {
 					bufferPtr = strchr(bufferPtr, ';');
 					assert(bufferPtr != NULL);
 					bufferPtr++;
 				}
 				otherunicode = scanHexField(bufferPtr, otherunicode);
 				// the Unicode char has a equivalent uppercase
 				if ((typeResult == LOWERCASE_LETTER) && (0 <= otherunicode && otherunicode < 65536)) {
 					set = TRUE;
 				}
 				if ((typeResult == UPPERCASE_LETTER) && !set) {
 					if (*bufferPtr != 0) bufferPtr++;
 					otherunicode = scanHexField(bufferPtr, otherunicode);
 					if (0 <= otherunicode && otherunicode < 65536) {
 						set = TRUE;
 					}
 				}
 				// Only fill slots that still hold the 0xffff default.
 				if ((set == TRUE) && (ucmp16_get(ulxfrmArray, (UChar)unicode) == (int16_t)0xffff))
 					ucmp16_set(ulxfrmArray, (UChar)unicode, (int16_t)otherunicode);
 				set = FALSE;
 			}

 			ucmp16_compact(ulxfrmArray);
 		}
 		catch (...) {
 			fprintf(stderr, "Error Occured while parsing unicode data file.\n");
 		}

 		// Close the file on the success path and the error path alike.
 		fclose(input);
 	}
 	return ulxfrmArray;
 }

 /**
  * Prints the contents of ulxfrmArray to cout as C++ source text: an index
  * table (Unicode::caseIndex), a value table (Unicode::caseValues) and the
  * number of values (Unicode::caseCount).
  *
  * Every element, including the last one of each table, is emitted with the
  * same cast, and both initializers are closed with "};".  If ulxfrmArray is
  * still null a diagnostic is written to stderr and nothing is printed.
  *
  * NOTE(review): caseIndex is declared as uint32_t in the emitted text while
  * its elements are cast to uint16_t; left as is because the matching
  * declaration is not visible here - confirm which type is intended.
  */
 void
 writeArrays()
 {
 	if (ulxfrmArray == 0) {
 		fprintf(stderr, "writeArrays: the array has not been built.\n");
 		return;
 	}

 	const int16_t* values = ucmp16_getArray(ulxfrmArray);
 	const uint16_t* indexes = ucmp16_getIndex(ulxfrmArray);
 	const int32_t indexCount = ucmp16_getkIndexCount();
 	const int32_t cnt = ucmp16_getCount(ulxfrmArray);
 	int32_t i;

 	cout << "\nconst uint32_t Unicode::caseIndex[] = {\n    ";
 	for (i = 0; i < indexCount - 1; i++)
 	{
 		// indexes[] is unsigned, so it can be printed directly.
 		cout << "(uint16_t)" << (int)indexes[i] << ", ";
 		if (i != 0 && i % 3 == 0)
 			cout << "\n    ";
 	}
 	if (indexCount > 0)
 		cout << "    (uint16_t)" << (int)indexes[indexCount - 1];
 	cout << " };\n";

 	cout << "\nconst int16_t Unicode::caseValues[] = {\n    ";
 	for (i = 0; i < cnt - 1; i++)
 	{
 		cout << "(int16_t)" << values[i] << ", ";
 		if (i != 0 && i % 5 == 0)
 			cout << "\n    ";
 	}
 	if (cnt > 0)
 		cout << "    (int16_t)" << values[cnt - 1];
 	cout << " };\n";

 	cout << "const int32_t Unicode::caseCount = " << cnt << ";\n";
 }
 /**
  * Entry point.  Expects exactly one argument, the data file to read; builds
  * the array with getArray() and prints the generated tables with
  * writeArrays().
  *
  * Diagnostics go to stderr so they cannot end up inside the generated text
  * on stdout.
  *
  * @return 0 on success; the process exits with status 1 on a usage error,
  *         when the file cannot be opened, or when no array could be built.
  */
 int main(int argc, char** argv)
 {
 	FILE *input = 0;

 	if (argc != 2) {
 		fprintf(stderr, "Usage : ulxfrm filename\n\n");
 		exit(1);
 	}
 	input = fopen(argv[1], "r");
 	if (input == 0) {
 		fprintf(stderr, "Cannot open the input file: %s\n\n", argv[1]);
 		exit(1);
 	}
 	// getArray() closes the file once it has read it.
 	if (getArray(input) == 0) {
 		fprintf(stderr, "Cannot build the array from: %s\n\n", argv[1]);
 		exit(1);
 	}
 	writeArrays();
 	return 0;
 }
	/*
	* (C) Copyright IBM Corp. 1998 - All Rights Reserved
	* @version 1.0 06/19/98
	* @author Helena Shih
	* Based on Taligent international support for C++
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <iostream.h>
	#include <string.h>
	#include <assert.h>

	#include "ucmp16.h"
	/* Table shared by the functions in this file: getArray() creates and fills it
	   when it is still null, and writeArrays() prints its contents. */
	CompactShortArray *ulxfrmArray = NULL;

	/*
	 * Numeric codes for the character categories handled by this tool.
	 * Every value is spelled out explicitly; they run from 0 through 29.
	 */
	enum ECharTypeMapping {
		/* unassigned */
		UNASSIGNED             = 0,

		/* letters */
		UPPERCASE_LETTER       = 1,
		LOWERCASE_LETTER       = 2,
		TITLECASE_LETTER       = 3,
		MODIFIER_LETTER        = 4,
		OTHER_LETTER           = 5,

		/* marks */
		NON_SPACING_MARK       = 6,
		ENCLOSING_MARK         = 7,
		COMBINING_SPACING_MARK = 8,

		/* numbers */
		DECIMAL_DIGIT_NUMBER   = 9,
		LETTER_NUMBER          = 10,
		OTHER_NUMBER           = 11,

		/* separators */
		SPACE_SEPARATOR        = 12,
		LINE_SEPARATOR         = 13,
		PARAGRAPH_SEPARATOR    = 14,

		/* control, format, private use, surrogate */
		CONTROL                = 15,
		FORMAT                 = 16,
		PRIVATE_USE            = 17,
		SURROGATE              = 18,

		/* punctuation */
		DASH_PUNCTUATION       = 19,
		START_PUNCTUATION      = 20,
		END_PUNCTUATION        = 21,
		CONNECTOR_PUNCTUATION  = 22,
		OTHER_PUNCTUATION      = 23,

		/* symbols */
		MATH_SYMBOL            = 24,
		CURRENCY_SYMBOL        = 25,
		MODIFIER_SYMBOL        = 26,
		OTHER_SYMBOL           = 27,

		/* initial / final punctuation */
		INITIAL_PUNCTUATION    = 28,
		FINAL_PUNCTUATION      = 29
	};

	/* NOTE(review): judging by its name this is the last code point expected in
	   the data file; nothing in the visible code reads it - confirm before use. */
	static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;

	/*
	 * Two-letter category tags packed back to back: tag k occupies characters
	 * 2k and 2k+1 of this string, and tagValues[k] holds the ECharTypeMapping
	 * code for that tag.  The adjacent literals below concatenate into a single
	 * string.
	 */
	const char tagStrings[] =
		"MnMcMe"
		"NdNlNo"
		"ZsZlZp"
		"CcCfCsCoCn"
		"LuLlLtLmLo"
		"PcPdPsPePo"
		"SmScSkSo"
		"PiPf";

	/* Category codes, in the same order as the tags in tagStrings. */
	const int16_t tagValues[] =
		{
		NON_SPACING_MARK,          /* Mn */
		COMBINING_SPACING_MARK,    /* Mc */
		ENCLOSING_MARK,            /* Me */
		DECIMAL_DIGIT_NUMBER,      /* Nd */
		LETTER_NUMBER,             /* Nl */
		OTHER_NUMBER,              /* No */
		SPACE_SEPARATOR,           /* Zs */
		LINE_SEPARATOR,            /* Zl */
		PARAGRAPH_SEPARATOR,       /* Zp */
		CONTROL,                   /* Cc */
		FORMAT,                    /* Cf */
		SURROGATE,                 /* Cs */
		PRIVATE_USE,               /* Co */
		UNASSIGNED,                /* Cn */
		UPPERCASE_LETTER,          /* Lu */
		LOWERCASE_LETTER,          /* Ll */
		TITLECASE_LETTER,          /* Lt */
		MODIFIER_LETTER,           /* Lm */
		OTHER_LETTER,              /* Lo */
		CONNECTOR_PUNCTUATION,     /* Pc */
		DASH_PUNCTUATION,          /* Pd */
		START_PUNCTUATION,         /* Ps */
		END_PUNCTUATION,           /* Pe */
		OTHER_PUNCTUATION,         /* Po */
		MATH_SYMBOL,               /* Sm */
		CURRENCY_SYMBOL,           /* Sc */
		MODIFIER_SYMBOL,           /* Sk */
		OTHER_SYMBOL,              /* So */
		INITIAL_PUNCTUATION,       /* Pi */
		FINAL_PUNCTUATION          /* Pf */
		};
	/**
	 * Looks up a two-letter category tag (for example "Lu") in tagStrings.
	 *
	 * Only whole tags are matched: the table is walked two characters at a time,
	 * so text that merely straddles two neighbouring tags (such as "nM") is
	 * rejected instead of being reported as the earlier tag.
	 *
	 * @param str  NUL-terminated tag text; must be exactly two characters long
	 *             to match.
	 * @return     Zero-based position of the tag, suitable as an index into
	 *             tagValues.  When the tag is not recognized a diagnostic is
	 *             written to stderr and 0 is returned; 0 is also the position of
	 *             the first tag, so callers cannot tell the two cases apart.
	 */
	int
	MakeProp(char* str)
	{
		const size_t tableLength = sizeof(tagStrings) - 1;	/* exclude the NUL */
		size_t offset;

		if (str != 0 && strlen(str) == 2) {
			/* Tags start on even offsets only. */
			for (offset = 0; offset + 1 < tableLength; offset += 2) {
				if (tagStrings[offset] == str[0] && tagStrings[offset + 1] == str[1])
					return (int)(offset / 2);
			}
		}

		fprintf(stderr, "unrecognized type letter %s\n", (str != 0) ? str : "(null)");
		return 0;
	}

	/**
	 * Reads one hexadecimal number from the start of text.
	 *
	 * @param text      Text to scan with sscanf's "%X" conversion.
	 * @param fallback  Value handed back when no number can be read.
	 * @return          The parsed value, or fallback when the conversion fails.
	 */
	static int32_t
	scanHexField(const char* text, int32_t fallback)
	{
		/* "%X" stores through an unsigned int*, so scan into one and convert. */
		unsigned int parsed = 0;

		if (sscanf(text, "%X", &parsed) != 1) return fallback;
		return (int32_t)parsed;
	}

	/**
	 * Builds the global ulxfrmArray from a ';'-delimited data file, unless it
	 * has already been built, and returns it.
	 *
	 * For every line that is not blank and does not start with '#':
	 *  - the leading hexadecimal number is taken as the code point;
	 *  - the third field supplies the two-letter category tag;
	 *  - a later field is read as a digit value, and for DECIMAL_DIGIT_NUMBER /
	 *    OTHER_NUMBER entries with a value of 0..9 a "{ code, digit }" line is
	 *    printed to cout;
	 *  - a still later field is read as the mapped code point, which is stored
	 *    in the array for LOWERCASE_LETTER / UPPERCASE_LETTER entries that do
	 *    not have a value yet.
	 *
	 * NOTE(review): each field scan is preceded by a one-character step.  When
	 * the field being left is empty, that step consumes the field's own ';', so
	 * the following skip loop lands one field further on.  The mapping lookups
	 * appear to rely on this - confirm against the data file layout before
	 * changing the stepping.
	 *
	 * @param input  Open data file.  It is closed by this function whenever the
	 *               array is built here; it is left untouched when the array
	 *               already exists.  A null input builds nothing.
	 * @return       The global ulxfrmArray (null if nothing could be built).
	 */
	CompactShortArray*
	getArray(FILE *input)
	{
		if (ulxfrmArray == 0 && input != 0) {
			char	buffer[1000];
			char*	bufferPtr;
			int	set = FALSE;
			char	type[3];

			try {
				ulxfrmArray = ucmp16_open((int16_t)0xffff);
				int32_t unicode, otherunicode, digit, i;
				while (TRUE) {
					unicode = -1;			// a failed scan must trip the range assert
					otherunicode = 0xffff;
					digit = -1;
					bufferPtr = fgets(buffer, sizeof(buffer), input);
					if (bufferPtr == NULL) break;
					if (bufferPtr[0] == '#' || bufferPtr[0] == '\n' || bufferPtr[0] == 0) continue;
					unicode = scanHexField(bufferPtr, unicode);
					assert(0 <= unicode && unicode < 65536);
					bufferPtr = strchr(bufferPtr, ';');
					assert(bufferPtr != NULL);
					bufferPtr = strchr(bufferPtr + 1, ';');
					assert(bufferPtr != NULL);
					strncpy(type, ++bufferPtr, 2); // go to start of third field
					type[2] = 0;
					int typeResult = tagValues[MakeProp(type)];

					// check for the decimal values
					if (*bufferPtr != 0) bufferPtr++;	// never step over the terminator
					for (i = 3; i < 8; i++) {
						bufferPtr = strchr(bufferPtr, ';');
						assert(bufferPtr != NULL);
						bufferPtr++;
					}
					digit = scanHexField(bufferPtr, digit);
					if (((typeResult == DECIMAL_DIGIT_NUMBER) || (typeResult == OTHER_NUMBER)) &&
					    (digit >= 0 && digit <= 9)) {
						// Format into a separate buffer: the line buffer is still being parsed.
						char codeText[16];
						sprintf(codeText, "0x%04X", (unsigned int)unicode);
						cout << " { " << codeText << ", " << digit << "}, \n";
					}

					if (*bufferPtr != 0) bufferPtr++;
					for (i = 8; i < 12; i++) {
						bufferPtr = strchr(bufferPtr, ';');
						assert(bufferPtr != NULL);
						bufferPtr++;
					}
					otherunicode = scanHexField(bufferPtr, otherunicode);
					// the Unicode char has a equivalent uppercase
					if ((typeResult == LOWERCASE_LETTER) && (0 <= otherunicode && otherunicode < 65536)) {
						set = TRUE;
					}
					if ((typeResult == UPPERCASE_LETTER) && !set) {
						if (*bufferPtr != 0) bufferPtr++;
						otherunicode = scanHexField(bufferPtr, otherunicode);
						if (0 <= otherunicode && otherunicode < 65536) {
							set = TRUE;
						}
					}
					// Only fill slots that still hold the 0xffff default.
					if ((set == TRUE) && (ucmp16_get(ulxfrmArray, (UChar)unicode) == (int16_t)0xffff))
						ucmp16_set(ulxfrmArray, (UChar)unicode, (int16_t)otherunicode);
					set = FALSE;
				}

				ucmp16_compact(ulxfrmArray);
			}
			catch (...) {
				fprintf(stderr, "Error Occured while parsing unicode data file.\n");
			}

			// Close the file on the success path and the error path alike.
			fclose(input);
		}
		return ulxfrmArray;
	}

	/**
	 * Prints the contents of ulxfrmArray to cout as C++ source text: an index
	 * table (Unicode::caseIndex), a value table (Unicode::caseValues) and the
	 * number of values (Unicode::caseCount).
	 *
	 * Every element, including the last one of each table, is emitted with the
	 * same cast, and both initializers are closed with "};".  If ulxfrmArray is
	 * still null a diagnostic is written to stderr and nothing is printed.
	 *
	 * NOTE(review): caseIndex is declared as uint32_t in the emitted text while
	 * its elements are cast to uint16_t; left as is because the matching
	 * declaration is not visible here - confirm which type is intended.
	 */
	void
	writeArrays()
	{
		if (ulxfrmArray == 0) {
			fprintf(stderr, "writeArrays: the array has not been built.\n");
			return;
		}

		const int16_t* values = ucmp16_getArray(ulxfrmArray);
		const uint16_t* indexes = ucmp16_getIndex(ulxfrmArray);
		const int32_t indexCount = ucmp16_getkIndexCount();
		const int32_t cnt = ucmp16_getCount(ulxfrmArray);
		int32_t i;

		cout << "\nconst uint32_t Unicode::caseIndex[] = {\n ";
		for (i = 0; i < indexCount - 1; i++)
		{
			// indexes[] is unsigned, so it can be printed directly.
			cout << "(uint16_t)" << (int)indexes[i] << ", ";
			if (i != 0 && i % 3 == 0)
				cout << "\n ";
		}
		if (indexCount > 0)
			cout << " (uint16_t)" << (int)indexes[indexCount - 1];
		cout << " };\n";

		cout << "\nconst int16_t Unicode::caseValues[] = {\n ";
		for (i = 0; i < cnt - 1; i++)
		{
			cout << "(int16_t)" << values[i] << ", ";
			if (i != 0 && i % 5 == 0)
				cout << "\n ";
		}
		if (cnt > 0)
			cout << " (int16_t)" << values[cnt - 1];
		cout << " };\n";

		cout << "const int32_t Unicode::caseCount = " << cnt << ";\n";
	}
	/**
	 * Entry point.  Expects exactly one argument, the data file to read; builds
	 * the array with getArray() and prints the generated tables with
	 * writeArrays().
	 *
	 * Diagnostics go to stderr so they cannot end up inside the generated text
	 * on stdout.
	 *
	 * @return 0 on success; the process exits with status 1 on a usage error,
	 *         when the file cannot be opened, or when no array could be built.
	 */
	int main(int argc, char** argv)
	{
		FILE *input = 0;

		if (argc != 2) {
			fprintf(stderr, "Usage : ulxfrm filename\n\n");
			exit(1);
		}
		input = fopen(argv[1], "r");
		if (input == 0) {
			fprintf(stderr, "Cannot open the input file: %s\n\n", argv[1]);
			exit(1);
		}
		// getArray() closes the file once it has read it.
		if (getArray(input) == 0) {
			fprintf(stderr, "Cannot build the array from: %s\n\n", argv[1]);
			exit(1);
		}
		writeArrays();
		return 0;
	}