/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: genuts46.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010mar02
* created by: Markus W. Scherer
*
* quick & dirty tool to recreate the UTS #46 data table according to the spec
*/
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string> |
| #include <string.h> |
| #include "unicode/utypes.h" |
| #include "unicode/errorcode.h" |
| #include "unicode/normalizer2.h" |
| #include "unicode/uniset.h" |
| #include "unicode/unistr.h" |
| #include "unicode/usetiter.h" |
| #include "unicode/usprep.h" |
| #include "sprpimpl.h" // HACK |
| |
| /** |
| * icu::ErrorCode subclass for easy UErrorCode handling. |
| * The destructor calls handleFailure() which calls exit(errorCode) when isFailure(). |
| */ |
| class ExitingErrorCode : public icu::ErrorCode { |
| public: |
| /** |
| * @param loc A short string describing where the ExitingErrorCode is used. |
| */ |
| ExitingErrorCode(const char *loc) : location(loc) {} |
| virtual ~ExitingErrorCode(); |
| protected: |
| virtual void handleFailure() const; |
| private: |
| const char *location; |
| }; |
| |
| ExitingErrorCode::~ExitingErrorCode() { |
| // Safe because our handleFailure() does not throw exceptions. |
| if(isFailure()) { handleFailure(); } |
| } |
| |
| void ExitingErrorCode::handleFailure() const { |
| fprintf(stderr, "error at %s: %s\n", location, errorName()); |
| exit(errorCode); |
| } |
| |
| static int |
| toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destString) { |
| UChar src[2]; |
| int32_t srcLength=0; |
| U16_APPEND_UNSAFE(src, srcLength, c); |
| UChar *dest; |
| int32_t destLength; |
| dest=destString.getBuffer(32); |
| if(dest==NULL) { |
| return FALSE; |
| } |
| UErrorCode errorCode=U_ZERO_ERROR; |
| destLength=usprep_prepare(prep, src, srcLength, |
| dest, destString.getCapacity(), |
| USPREP_DEFAULT, NULL, &errorCode); |
| destString.releaseBuffer(destLength); |
| if(errorCode==U_STRINGPREP_PROHIBITED_ERROR) { |
| return -1; |
| } else { |
| // Returns FALSE=0 for U_STRINGPREP_UNASSIGNED_ERROR and processing errors, |
| // TRUE=1 if c is valid or mapped. |
| return U_SUCCESS(errorCode); |
| } |
| } |
| |
/**
 * Status values that can be written to the generated table.
 * The enumerator values are used as indexes into statusNames[],
 * so both lists must be kept in the same order.
 */
enum Status {
    DISALLOWED,
    IGNORED,
    MAPPED,
    DEVIATION,
    VALID,
    DISALLOWED_STD3_VALID,
    DISALLOWED_STD3_MAPPED
};

/** Output spelling of each Status value, indexed by the enumerator. */
static const char *const statusNames[]={
    "disallowed",
    "ignored",
    "mapped",
    "deviation",
    "valid",
    "disallowed_STD3_valid",
    "disallowed_STD3_mapped"
};
| |
| static void |
| printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) { |
| if(start==end) { |
| printf("%04lX ", (long)start); |
| } else { |
| printf("%04lX..%04lX ", (long)start, (long)end); |
| } |
| printf("; %s", statusNames[status]); |
| if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) { |
| printf(" ;"); |
| const UChar *buffer=mapping.getBuffer(); |
| int32_t length=mapping.length(); |
| int32_t i=0; |
| UChar32 c; |
| while(i<length) { |
| U16_NEXT(buffer, i, length, c); |
| printf(" %04lX", (long)c); |
| } |
| } |
| puts(""); |
| } |
| |
| static void |
| getAgeIfAssigned(UChar32 c, UVersionInfo age) { |
| if(u_isdefined(c)) { |
| u_charAge(c, age); |
| } else if(U_IS_UNICODE_NONCHAR(c)) { |
| age[0]=0; |
| age[1]=0; |
| age[2]=0; |
| age[3]=1; |
| } else { |
| memset(age, 0, 4); |
| } |
| } |
| |
| extern int |
| main(int argc, const char *argv[]) { |
| ExitingErrorCode errorCode("genuts46"); |
| |
| // predefined base sets |
| icu::UnicodeSet unassignedSet(UNICODE_STRING_SIMPLE("[:Cn:]"), errorCode); |
| |
| icu::UnicodeSet labelSeparators( |
| UNICODE_STRING_SIMPLE("[\\u002E\\u3002\\uFF0E\\uFF61]"), errorCode); |
| |
| icu::UnicodeSet mappedSet( |
| UNICODE_STRING_SIMPLE("[:Changes_When_NFKC_Casefolded:]"), errorCode); |
| mappedSet.removeAll(labelSeparators); // simplifies checking of mapped characters |
| |
| icu::UnicodeSet baseValidSet(icu::UnicodeString( |
| "[[[[:^Changes_When_NFKC_Casefolded:]" |
| "-[:C:]-[:Z:]" |
| "-[:Block=Ideographic_Description_Characters:]]" |
| "[:ascii:]]-[.]]", -1, US_INV), errorCode); |
| |
| // Characters that are disallowed when STD3 rules are applied, |
| // but valid when STD3 rules are not applied. |
| icu::UnicodeSet disallowedSTD3Set(icu::UnicodeString( |
| "[[:ascii:]-[\\u002D.a-zA-Z0-9]]", -1, US_INV), errorCode); |
| |
| icu::UnicodeSet deviationSet( |
| UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode); |
| errorCode.assertSuccess(); |
| |
| // derived sets |
| icu::LocalUStringPrepProfilePointer namePrep(usprep_openByType(USPREP_RFC3491_NAMEPREP, errorCode)); |
| const icu::Normalizer2 *nfkc_cf= |
| icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode); |
| errorCode.assertSuccess(); |
| |
| // HACK: The StringPrep API performs a BiDi check according to the data. |
| // We need to override that for this data generation, by resetting an internal flag. |
| namePrep->checkBiDi=FALSE; |
| |
| icu::UnicodeSet baseExclusionSet; |
| icu::UnicodeString cString, mapping, namePrepResult; |
| for(UChar32 c=0; c<=0x10ffff; ++c) { |
| if(c==0xd800) { |
| c=0xe000; |
| } |
| int namePrepStatus=toIDNA2003(namePrep.getAlias(), c, namePrepResult); |
| if(namePrepStatus!=0) { |
| // get the UTS #46 base mapping value |
| switch(c) { |
| case 0xff0e: |
| case 0x3002: |
| case 0xff61: |
| mapping.setTo(0x2e); |
| break; |
| default: |
| cString.setTo(c); |
| nfkc_cf->normalize(cString, mapping, errorCode); |
| break; |
| } |
| if( |
| namePrepStatus>0 ? |
| // c is valid or mapped in IDNA2003 |
| !labelSeparators.contains(c) && namePrepResult!=mapping : |
| // namePrepStatus<0: c is prohibited in IDNA2003 |
| baseValidSet.contains(c) || (cString!=mapping && baseValidSet.containsAll(mapping)) |
| ) { |
| baseExclusionSet.add(c); |
| } |
| } |
| } |
| |
| icu::UnicodeSet disallowedSet(0, 0x10ffff); |
| disallowedSet. |
| removeAll(labelSeparators). |
| removeAll(deviationSet). |
| removeAll(mappedSet). |
| removeAll(baseValidSet). |
| addAll(baseExclusionSet). |
| addAll(unassignedSet); |
| |
| const icu::Normalizer2 *nfd= |
| icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode); |
| errorCode.assertSuccess(); |
| |
| icu::UnicodeSet ignoredSet; // will be a subset of mappedSet |
| icu::UnicodeSet removeSet; |
| icu::UnicodeString nfdString; |
| { |
| icu::UnicodeSetIterator iter(mappedSet); |
| while(iter.next()) { |
| UChar32 c=iter.getCodepoint(); |
| cString.setTo(c); |
| nfkc_cf->normalize(cString, mapping, errorCode); |
| if(!baseValidSet.containsAll(mapping)) { |
| fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c); |
| disallowedSet.add(c); |
| removeSet.add(c); |
| } else if(mapping.isEmpty()) { |
| ignoredSet.add(c); |
| } |
| } |
| mappedSet.removeAll(removeSet); |
| } |
| errorCode.assertSuccess(); |
| |
| icu::UnicodeSet validSet(baseValidSet); |
| validSet. |
| removeAll(labelSeparators). // non-ASCII label separators will be mapped in the end |
| removeAll(deviationSet). |
| removeAll(disallowedSet). |
| removeAll(mappedSet). |
| add(0x2e); // not mapped, simply valid |
| UBool madeChange; |
| do { |
| madeChange=FALSE; |
| { |
| removeSet.clear(); |
| icu::UnicodeSetIterator iter(validSet); |
| while(iter.next()) { |
| UChar32 c=iter.getCodepoint(); |
| if(nfd->getDecomposition(c, nfdString) && !validSet.containsAll(nfdString)) { |
| fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c); |
| disallowedSet.add(c); |
| removeSet.add(c); |
| madeChange=TRUE; |
| } |
| } |
| validSet.removeAll(removeSet); |
| } |
| { |
| removeSet.clear(); |
| icu::UnicodeSetIterator iter(mappedSet); |
| while(iter.next()) { |
| UChar32 c=iter.getCodepoint(); |
| cString.setTo(c); |
| nfkc_cf->normalize(cString, mapping, errorCode); |
| nfd->normalize(mapping, nfdString, errorCode); |
| if(!validSet.containsAll(nfdString)) { |
| fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c); |
| disallowedSet.add(c); |
| removeSet.add(c); |
| madeChange=TRUE; |
| } |
| } |
| mappedSet.removeAll(removeSet); |
| } |
| } while(madeChange); |
| errorCode.assertSuccess(); |
| |
| // finish up |
| labelSeparators.remove(0x2e).freeze(); // U+002E is simply valid |
| deviationSet.freeze(); |
| ignoredSet.freeze(); |
| validSet.freeze(); |
| mappedSet.freeze(); |
| disallowedSTD3Set.freeze(); |
| |
| // output |
| UChar32 prevStart=0, c=0; |
| Status prevStatus=DISALLOWED_STD3_VALID, status; |
| icu::UnicodeString prevMapping; |
| UVersionInfo prevAge={ 1, 1, 0, 0 }, age; |
| |
| icu::UnicodeSetIterator iter(disallowedSet); |
| while(iter.nextRange()) { |
| UChar32 start=iter.getCodepoint(); |
| while(c<start) { |
| mapping.remove(); |
| if(labelSeparators.contains(c)) { |
| status=MAPPED; |
| mapping.setTo(0x2e); |
| } else if(deviationSet.contains(c)) { |
| status=DEVIATION; |
| cString.setTo(c); |
| nfkc_cf->normalize(cString, mapping, errorCode); |
| } else if(ignoredSet.contains(c)) { |
| status=IGNORED; |
| } else if(validSet.contains(c)) { |
| if(disallowedSTD3Set.contains(c)) { |
| fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: itself not STD3\n", (long)c); |
| status=DISALLOWED_STD3_VALID; |
| } else if( nfd->getDecomposition(c, nfdString) && |
| disallowedSTD3Set.containsSome(nfdString) |
| ) { |
| fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: NFD not wholly STD3\n", (long)c); |
| status=DISALLOWED_STD3_VALID; |
| } else { |
| status=VALID; |
| } |
| } else if(mappedSet.contains(c)) { |
| cString.setTo(c); |
| nfkc_cf->normalize(cString, mapping, errorCode); |
| if(disallowedSTD3Set.containsSome(mapping)) { |
| fprintf(stderr, "U+%04lX mapped -> disallowed_STD3_mapped\n", (long)c); |
| status=DISALLOWED_STD3_MAPPED; |
| } else { |
| status=MAPPED; |
| } |
| } else { |
| fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c); |
| } |
| // Print a new line where the status, the mapping or |
| // the character age change. |
| getAgeIfAssigned(c, age); |
| if( prevStart<c && |
| (status!=prevStatus || mapping!=prevMapping || 0!=memcmp(prevAge, age, 4)) |
| ) { |
| printLine(prevStart, c-1, prevStatus, prevMapping); |
| prevStart=c; |
| prevStatus=status; |
| prevMapping=mapping; |
| memcpy(prevAge, age, 4); |
| } |
| ++c; |
| } |
| // c==start is disallowed |
| if(prevStart<c) { |
| printLine(prevStart, c-1, prevStatus, prevMapping); |
| } |
| prevStart=c; |
| prevStatus=DISALLOWED; |
| prevMapping.remove(); |
| getAgeIfAssigned(c, prevAge); |
| UChar32 end=iter.getCodepointEnd(); |
| while(++c<=end) { |
| getAgeIfAssigned(c, age); |
| if(prevStart<c && 0!=memcmp(prevAge, age, 4)) { |
| printLine(prevStart, c-1, prevStatus, prevMapping); |
| prevStart=c; |
| memcpy(prevAge, age, 4); |
| } |
| } |
| } |
| if(prevStart<c) { |
| printLine(prevStart, c-1, prevStatus, prevMapping); |
| } |
| return 0; |
| } |