|  | /* | 
|  | ******************************************************************************* | 
|  | * | 
|  | *   Copyright (C) 2000-2001, International Business Machines | 
|  | *   Corporation and others.  All Rights Reserved. | 
|  | * | 
|  | ******************************************************************************* | 
|  | *   file name:  rptp2ucm.c | 
|  | *   encoding:   US-ASCII | 
|  | *   tab size:   8 (not used) | 
|  | *   indentation:4 | 
|  | * | 
|  | *   created on: 2001feb16 | 
|  | *   created by: Markus W. Scherer | 
|  | * | 
|  | *   This tool reads two CDRA conversion table files (RPMAP & TPMAP or RXMAP and TXMAP) and | 
|  | *   generates a canonicalized ICU .ucm file from them. | 
|  | *   If the RPMAP/RXMAP file does not contain a comment line with the substitution character, | 
|  | *   then this tool also attempts to read the header of the corresponding UPMAP/UXMAP file | 
|  | *   to extract subchar and subchar1. | 
|  | * | 
|  | *   R*MAP: Unicode->codepage | 
|  | *   T*MAP: codepage->Unicode | 
|  | * | 
|  | *   To compile, just call a C compiler/linker with this source file. | 
|  | *   On Windows: cl rptp2ucm.c | 
|  | */ | 
|  |  | 
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
|  |  | 
/* Substitution bytes for one conversion table, looked up by table name. */
typedef struct UCMSubchar {
    const char *name;        /* name that getSubchar() compares against */
    unsigned long subchar;   /* copied into the global subchar on a match */
    unsigned long subchar1;  /* copied into the global subchar1 on a match */
} UCMSubchar;

/* Substitution bytes for tables that processTable() could not get from the source files. */
static const UCMSubchar knownSubchars[] = {
    { "274_P100",  0x3f, 0 },
    { "850_P100",  0x7f, 0 },
    { "913_P100",  0x1a, 0 },
    { "1047_P100", 0x3f, 0 }
};
|  |  | 
/* Associates a CCSID with the <icu:state> lines that writeUCM() emits for it. */
typedef struct CCSIDStateTable {
    unsigned int ccsid;   /* compared with the global ccsid in getStateTable() */
    const char *table;    /* complete <icu:state> lines, each terminated with \n */
} CCSIDStateTable;

/* Year when the ucm files were produced using this tool; it is appended to the .ucm name */
#define YEAR "2000"

/* state table text shared by several CCSIDs in knownStateTables[] */
#define japanesePCDBCSStates  "<icu:state>                   0-ff:2, 81-9f:1, a0-fc:1\n"\
                              "<icu:state>                   40-7e, 80-fc\n"\
                              "<icu:state>\n"

/*
 * State tables for known CCSIDs.
 * Every table ends with a newline; writeUCM() adds one more "\n" so that
 * a blank line separates the state table from the CHARMAP section.
 */
static const CCSIDStateTable
knownStateTables[]={

    { 301,
      "<icu:state>                   0-ff:2, 81-9f:1, e0-fc:1\n"
      "<icu:state>                   40-7e, 80-fc\n"
      "<icu:state>\n" },

    { 367,
      "<icu:state>                   0-7f\n" },

    { 927, japanesePCDBCSStates },

    { 926, japanesePCDBCSStates },

    { 928, japanesePCDBCSStates },

    { 932,
      "<icu:state>                   0-7f,80,81-9f:1,a0-df,fd-ff, e0-fc:1\n"
      "<icu:state>                   40-7e, 80-fc\n" },

    { 941, japanesePCDBCSStates },

    { 942,
      "<icu:state>                   0-80, 81-9f:1, a0-df, e0-fc:1, fd-ff\n"
      "<icu:state>                   40-7e, 80-fc\n" },

    { 943,
      "<icu:state>                   0-7f, 81-9f:1, a0-df, e0-fc:1\n"
      "<icu:state>                   40-7e, 80-fc\n" },

    { 944,
      "<icu:state>                   0-80, 81-bf:1, c0-ff\n"
      "<icu:state>                   40-7e, 80-fe\n" },

    /* the last line used to lack its "\n", unlike every other table (compare 948) */
    { 946,
      "<icu:state>                   0-80, 81-fb:1,fc:2,fd-ff\n"
      "<icu:state>                   40-7e, 80-fe\n"
      "<icu:state>                   80-fe.u,fc\n" },

    { 947,
      "<icu:state>                   0-7f, 80-fe:1\n"
      "<icu:state>                   40-7e, 80-fe\n" },

    { 948,
      "<icu:state>                   0-80, 81-fb:1,fc:2,fd-fe\n"
      "<icu:state>                   40-7e, 80-fe\n"
      "<icu:state>                   80-fe.u,fc\n" },

    { 949,
      "<icu:state>                   0-84, 8f-fe:1\n"
      "<icu:state>                   40-7e, 80-fe\n" },

    { 950,
      "<icu:state>                   0-7f, 81-fe:1\n"
      "<icu:state>                   40-7e, 81-fe\n" },

    { 954,
      "<icu:state>                   0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n"
      "<icu:state>                   a1-fe\n"
      "<icu:state>                   a1-e4\n"
      "<icu:state>                   a1-fe:1, a1:4\n"
      "<icu:state>                   a1-fe.u\n" },

    { 955,
      "<icu:state>                   0-20:2, 21-7e:1, 7f-ff:2\n"
      "<icu:state>                   21-7e\n"
      "<icu:state>\n" },

    { 963,
      "<icu:state>                   0-20:2, 21-7e:1, 7f-ff:2\n"
      "<icu:state>                   21-7e\n"
      "<icu:state>\n" },

    { 964,
      "<icu:state>                   0-8d, 8e:2, 90-9f, a1-fe:1, aa-c1:5, c3:5, fe:5\n"
      "<icu:state>                   a1-fe\n"
      "<icu:state>                   a1-b0:3, a1:4, a2:8, a3-ab:4, ac:7, ad:6, ae-b0:4\n"
      "<icu:state>                   a1-fe:1\n"
      "<icu:state>                   a1-fe:5\n"
      "<icu:state>                   a1-fe.u\n"
      "<icu:state>                   a1-a4:1, a5-fe:5\n"
      "<icu:state>                   a1-e2:1, e3-fe:5\n"
      "<icu:state>                   a1-f2:1, f3-fe:5\n" },

    { 970,
      "<icu:state>                   0-9f, a1-fe:1\n"
      "<icu:state>                   a1-fe\n" },

    { 1363,
      "<icu:state>                   0-7f, 81-fe:1\n"
      "<icu:state>                   40-7e, 80-fe\n" },

    { 1350,
      "<icu:state>                   0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n"
      "<icu:state>                   a1-fe\n"
      "<icu:state>                   a1-e4\n"
      "<icu:state>                   a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4\n"
      "<icu:state>                   a1-fe.u\n" },

    { 1351,
      "<icu:state>                   0-ff:2, 81-9f:1, e0-fc:1\n"
      "<icu:state>                   40-7e, 80-fc\n"
      "<icu:state>\n" },

    { 1370,
      "<icu:state>                   0-80, 81-fe:1\n"
      "<icu:state>                   40-7e, 81-fe\n" },

    { 1381,
      "<icu:state>                   0-84, 8c-fe:1\n"
      "<icu:state>                   a1-fe\n" },

    { 1383,
      "<icu:state>                   0-9f, a1-fe:1\n"
      "<icu:state>                   a1-fe\n" },

    { 1385,
      "<icu:state>                   0-ff:2,81-fe:1\n"
      "<icu:state>                   40-7e, 80-fe\n"
      "<icu:state>\n" },

    { 1386,
      "<icu:state>                   0-7f, 81-fe:1\n"
      "<icu:state>                   40-7e, 80-fe\n" },

    { 5039,
      "<icu:state>                   0-80, 81-9f:1, a0-df, e0-fc:1, fd-ff\n"
      "<icu:state>                   40-7e, 80-fc\n" },

    { 5050,
      "<icu:state>                   0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n"
      "<icu:state>                   a1-fe\n"
      "<icu:state>                   a1-e4\n"
      "<icu:state>                   a1-fe:1, a1:4, a3-af:4, b6:4, d6:4, da-db:4, ed-f2:4\n"
      "<icu:state>                   a1-fe.u\n" },

    { 5067,
      "<icu:state>                   0-ff:2, 21-7e:1\n"
      "<icu:state>                   21-7e\n"
      "<icu:state>\n" },

    { 5478,
      "<icu:state>                   0-ff:2, 21-7e:1\n"
      "<icu:state>                   21-7e\n"
      "<icu:state>\n" },

    { 21427,
      "<icu:state>                   0-80:2, 81-fe:1, ff:2\n"
      "<icu:state>                   40-7e, 80-fe\n"
      "<icu:state>\n" },

    { 25546,
      "<icu:state>                   0-7f, e:1.s, f:0.s\n"
      "<icu:state>                   initial, 0-20:3, e:1.s, f:0.s, 21-7e:2, 7f-ff:3\n"
      "<icu:state>                   0-20:1.i, 21-7e:1., 7f-ff:1.i\n"
      "<icu:state>                   0-ff:1.i\n" },

    { 33722,
      "<icu:state>                   0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n"
      "<icu:state>                   a1-fe\n"
      "<icu:state>                   a1-e4\n"
      "<icu:state>                   a1-fe:1, a1:4, a3-af:4, b6:4, d6:4, da-db:4, ed-f2:4\n"
      "<icu:state>                   a1-fe.u\n" }

};
|  |  | 
/*
 * One entry of a conversion table.
 *
 * u packs two fields:
 *   bits 31..24  fallback indicator
 *                  0  roundtrip
 *                  1  Unicode->codepage
 *                  3  codepage->Unicode
 *   bits 23.. 0  Unicode code point
 *
 * b holds the codepage bytes with leading zeroes.
 */
typedef struct Mapping {
    unsigned long u;
    unsigned long b;
} Mapping;

/* number of elements in each of the two mapping arrays */
#define MAX_MAPPINGS_COUNT 200000

/* mappings parsed from the R*MAP file; mergeMappings() merges the T*MAP entries into it */
static Mapping fromUMappings[MAX_MAPPINGS_COUNT];

/* mappings parsed from the T*MAP file */
static Mapping toUMappings[MAX_MAPPINGS_COUNT];

/* number of entries in use in fromUMappings[] and toUMappings[] */
static long fromUMappingsTop;
static long toUMappingsTop;

/* substitution bytes of the current table, 0 while not known */
static unsigned long subchar;
static unsigned long subchar1;

/* CCSID taken from the name of the current RPMAP file */
static unsigned int ccsid;
|  |  | 
/* values stored in charsetFamily */
enum {
    ASCII,
    EBCDIC,
    UNKNOWN
};

/* properties of the current table: reset by init(), computed by analyzeTable() */
static char minCharLength;    /* shortest codepage byte sequence */
static char maxCharLength;    /* longest codepage byte sequence */
static char charsetFamily;    /* ASCII, EBCDIC or UNKNOWN */
static char usesPUA;          /* a mapping uses a private use code point */
static char variantLF;        /* EBCDIC table maps U+000a to 0x15 */
static char variantASCII;     /* U+0021..U+007e are not all mapped to the same byte values */
static char variantControls;  /* a C0 control or U+007f is mapped to a different byte value */
static char variantSUB;       /* IBM PC rotation of 0x1a, 0x1c and 0x7f */
static char is7Bit;           /* no codepage byte has a value above 0x7f */
|  |  | 
|  | static void | 
|  | init() { | 
|  | fromUMappingsTop=toUMappingsTop=0; | 
|  |  | 
|  | subchar=subchar1=0; | 
|  | ccsid=0; | 
|  |  | 
|  | minCharLength=4; | 
|  | maxCharLength=0; | 
|  | charsetFamily=UNKNOWN; | 
|  | usesPUA=0; | 
|  | variantLF=0; | 
|  | variantASCII=0; | 
|  | variantControls=0; | 
|  | variantSUB=0; | 
|  | is7Bit=0; | 
|  | } | 
|  |  | 
|  | /* lexically compare Mappings for sorting */ | 
|  | static int | 
|  | compareMappings(const void *left, const void *right) { | 
|  | const Mapping *l=(const Mapping *)left, *r=(const Mapping *)right; | 
|  | long result; | 
|  |  | 
|  | /* the code points use fewer than 32 bits, just cast them to signed values and subtract */ | 
|  | result=(long)(l->u&0xffffff)-(long)(r->u&0xffffff); | 
|  | if(result!=0) { | 
|  | /* shift right 16 with sign-extend to take care of int possibly being 16 bits wide */ | 
|  | return (int)(result>>16)|1; | 
|  | } | 
|  |  | 
|  | /* the b fields may use all 32 bits as unsigned long, so result=(long)(l->b-r->b) would not work (try l->b=0x80000000 and r->b=1) */ | 
|  | if(l->b<r->b) { | 
|  | return -1; | 
|  | } else if(l->b>r->b) { | 
|  | return 1; | 
|  | } | 
|  |  | 
|  | return (int)(l->u>>24)-(int)(r->u>>24); | 
|  | } | 
|  |  | 
/* Return a pointer to the first character in s that is neither a space nor a tab. */
static const char *
skipWhitespace(const char *s) {
    for(;;) {
        char c=*s;

        if(c!=' ' && c!='\t') {
            return s;
        }
        ++s;
    }
}
|  |  | 
|  | static long | 
|  | parseMappings(FILE *f, Mapping *mappings) { | 
|  | char line[200]; | 
|  | Mapping *oldMappings; | 
|  | char *s, *end; | 
|  | long mappingsTop=0; | 
|  |  | 
|  | oldMappings=mappings; | 
|  | while(fgets(line, sizeof(line), f)!=NULL) { | 
|  | s=(char *)skipWhitespace(line); | 
|  |  | 
|  | /* skip empty lines */ | 
|  | if(*s==0 || *s=='\n' || *s=='\r') { | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* explicit end of table */ | 
|  | if(memcmp(s, "END CHARMAP", 11)==0) { | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* comment lines, parse substitution characters, otherwise skip them */ | 
|  | if(*s=='#' || *s=='*') { | 
|  | /* get subchar1 */ | 
|  | s=strstr(line, "for U+00xx"); | 
|  | if(s!=NULL) { | 
|  | s=strstr(line, "x'"); | 
|  | if(s!=NULL) { | 
|  | s+=2; | 
|  | subchar1=strtoul(s, &end, 16); | 
|  | if(end!=s+2 || *end!='\'') { | 
|  | fprintf(stderr, "error parsing subchar1 from \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  | continue; | 
|  | } else { | 
|  | fprintf(stderr, "error finding subchar1 on \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* get subchar */ | 
|  | s=strstr(line, "for U+xxxx"); | 
|  | if(s!=NULL) { | 
|  | s=strstr(line, "x'"); | 
|  | if(s!=NULL) { | 
|  | s+=2; | 
|  | subchar=strtoul(s, &end, 16); | 
|  | if(end<s+2 || *end!='\'') { | 
|  | fprintf(stderr, "error parsing subchar from \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  | continue; | 
|  | } else { | 
|  | fprintf(stderr, "error finding subchar on \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  | } | 
|  |  | 
|  | continue; | 
|  | } | 
|  |  | 
|  | mappings->b=strtoul(s, &end, 16); | 
|  | if(s==end || (*end!=' ' && *end!='\t')) { | 
|  | if((s+1)==end && *end=='-' && (mappings->b<=3)) { | 
|  | /* this is a special EUC format where the code set number prepends the bytes */ | 
|  | unsigned long prefix; | 
|  |  | 
|  | switch(mappings->b) { | 
|  | case 0: | 
|  | prefix=0; | 
|  | break; | 
|  | case 1: | 
|  | prefix=0; | 
|  | break; | 
|  | case 2: | 
|  | prefix=0x8e; | 
|  | break; | 
|  | case 3: | 
|  | prefix=0x8f; | 
|  | break; | 
|  | default: | 
|  | /* never occurs because of above check */ | 
|  | break; | 
|  | } | 
|  |  | 
|  | s+=2; | 
|  | mappings->b=strtoul(s, &end, 16); | 
|  | if(s==end || ((end-s)&1) || (*end!=' ' && *end!='\t')) { | 
|  | fprintf(stderr, "error parsing EUC codepage bytes on \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  | mappings->b|=prefix<<(4*(end-s)); | 
|  | } else { | 
|  | fprintf(stderr, "error parsing codepage bytes on \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  | } | 
|  |  | 
|  | s=(char *)skipWhitespace(end); | 
|  | mappings->u=strtoul(s, &end, 16); | 
|  | if(s==end || (*end!=' ' && *end!='\t' && *end!='\n' && *end!='\r' && *end!=0)) { | 
|  | if(strncmp(s, "????", 4)==0 || strstr(s, "UNASSIGNED")!=NULL) { | 
|  | /* this is a non-entry, do not add it to the mapping table */ | 
|  | continue; | 
|  | } | 
|  | fprintf(stderr, "error parsing Unicode code point on \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  |  | 
|  | ++mappings; | 
|  | if(++mappingsTop>=MAX_MAPPINGS_COUNT) { | 
|  | fprintf(stderr, "error: too many mappings at \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* sort the mappings */ | 
|  | qsort(oldMappings, mappingsTop, sizeof(Mapping), compareMappings); | 
|  |  | 
|  | return mappingsTop; | 
|  | } | 
|  |  | 
|  | /* merge the mappings into fromUMappings and add fallback indicator values to Mapping.u bits 31..24 */ | 
|  | static void | 
|  | mergeMappings() { | 
|  | long fromUIndex, toUIndex, newFromUMappingsTop=fromUMappingsTop; | 
|  | int cmp; | 
|  |  | 
|  | fromUIndex=toUIndex=0; | 
|  | while(fromUIndex<fromUMappingsTop && toUIndex<toUMappingsTop) { | 
|  | cmp=compareMappings(fromUMappings+fromUIndex, toUMappings+toUIndex); | 
|  | if(cmp==0) { | 
|  | /* equal: roundtrip, nothing to do */ | 
|  | ++fromUIndex; | 
|  | ++toUIndex; | 
|  | } else if(cmp<0) { | 
|  | /* | 
|  | * the fromU mapping does not have a toU counterpart: | 
|  | * fallback Unicode->codepage | 
|  | */ | 
|  | if(fromUMappings[fromUIndex].b!=subchar && fromUMappings[fromUIndex].b!=subchar1) { | 
|  | fromUMappings[fromUIndex++].u|=0x1000000; | 
|  | } else { | 
|  | fromUMappings[fromUIndex++].u|=0x2000000; | 
|  | } | 
|  | } else { | 
|  | /* | 
|  | * the toU mapping does not have a fromU counterpart: | 
|  | * (reverse) fallback codepage->Unicode, copy it to the fromU table | 
|  | */ | 
|  | fromUMappings[newFromUMappingsTop].u=toUMappings[toUIndex].u|=0x3000000; | 
|  | fromUMappings[newFromUMappingsTop++].b=toUMappings[toUIndex++].b; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* either one or both tables are exhausted */ | 
|  | while(fromUIndex<fromUMappingsTop) { | 
|  | /* leftover fromU mappings are fallbacks */ | 
|  | if(fromUMappings[fromUIndex].b!=subchar && fromUMappings[fromUIndex].b!=subchar1) { | 
|  | fromUMappings[fromUIndex++].u|=0x1000000; | 
|  | } else { | 
|  | fromUMappings[fromUIndex++].u|=0x2000000; | 
|  | } | 
|  | } | 
|  |  | 
|  | while(toUIndex<toUMappingsTop) { | 
|  | /* leftover toU mappings are reverse fallbacks */ | 
|  | fromUMappings[newFromUMappingsTop].u=toUMappings[toUIndex].u|=0x3000000; | 
|  | fromUMappings[newFromUMappingsTop++].b=toUMappings[toUIndex++].b; | 
|  | } | 
|  |  | 
|  | fromUMappingsTop=newFromUMappingsTop; | 
|  |  | 
|  | /* re-sort the mappings */ | 
|  | qsort(fromUMappings, fromUMappingsTop, sizeof(Mapping), compareMappings); | 
|  | } | 
|  |  | 
|  | static void | 
|  | analyzeTable() { | 
|  | unsigned long u, b, f, minTwoByte=0xffff, maxTwoByte=0, oredBytes=0; | 
|  | long i, countASCII=0; | 
|  | char length; | 
|  |  | 
|  | for(i=0; i<fromUMappingsTop; ++i) { | 
|  | f=fromUMappings[i].u>>24; | 
|  | u=fromUMappings[i].u&0xffffff; | 
|  | b=fromUMappings[i].b; | 
|  |  | 
|  | oredBytes|=b; | 
|  |  | 
|  | /* character length? */ | 
|  | if(b<=0xff) { | 
|  | length=1; | 
|  | } else if(b<=0xffff) { | 
|  | length=2; | 
|  | if(b<minTwoByte) { | 
|  | minTwoByte=b; | 
|  | } | 
|  | if(b>maxTwoByte) { | 
|  | maxTwoByte=b; | 
|  | } | 
|  | } else if(b<=0xffffff) { | 
|  | length=3; | 
|  | } else { | 
|  | length=4; | 
|  | } | 
|  | if(length<minCharLength) { | 
|  | minCharLength=length; | 
|  | } | 
|  | if(length>maxCharLength) { | 
|  | maxCharLength=length; | 
|  | } | 
|  |  | 
|  | /* PUA used? */ | 
|  | if((unsigned long)(u-0xe000)<0x1900 || (unsigned long)(u-0xf0000)<0x20000) { | 
|  | usesPUA=1; | 
|  | } | 
|  |  | 
|  | /* only consider roundtrip mappings for the rest */ | 
|  | if(f!=0) { | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* ASCII or EBCDIC? */ | 
|  | if(u==0x41) { | 
|  | if(b==0x41) { | 
|  | charsetFamily=ASCII; | 
|  | } else if(b==0xc1) { | 
|  | charsetFamily=EBCDIC; | 
|  | } | 
|  | } else if(u==0xa) { | 
|  | if(b==0xa) { | 
|  | charsetFamily=ASCII; | 
|  | } else if(b==0x25) { | 
|  | charsetFamily=EBCDIC; | 
|  | variantLF=0; | 
|  | } else if(b==0x15) { | 
|  | charsetFamily=EBCDIC; | 
|  | variantLF=1; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* US-ASCII? */ | 
|  | if((unsigned long)(u-0x21)<94) { | 
|  | if(u==b) { | 
|  | ++countASCII; | 
|  | } else { | 
|  | variantASCII=1; | 
|  | } | 
|  | } else if(u<0x20 || u==0x7f) { | 
|  | /* non-ISO C0 controls? */ | 
|  | if(u!=b) { | 
|  | /* IBM PC rotation of SUB and other controls: 0x1a->0x7f->0x1c->0x1a */ | 
|  | if(u==0x1a && b==0x7f || u==0x1c && b==0x1a || u==0x7f && b==0x1c) { | 
|  | charsetFamily=ASCII; | 
|  | variantSUB=1; | 
|  | } else { | 
|  | variantControls=1; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | is7Bit= oredBytes<=0x7f; | 
|  |  | 
|  | if(charsetFamily==UNKNOWN) { | 
|  | if(minCharLength==2 && maxCharLength==2) { | 
|  | /* guess the charset family for DBCS according to typical byte distributions */ | 
|  | if( ((0x2020<=minTwoByte || minTwoByte<=0x217e) && maxTwoByte<=0x7e7e) || | 
|  | ((0xa0a0<=minTwoByte || minTwoByte<=0xa1fe) && maxTwoByte<=0xfefe) || | 
|  | ((0x8140<=minTwoByte || minTwoByte<=0x81fe) && maxTwoByte<=0xfefe) | 
|  | ) { | 
|  | charsetFamily=ASCII; | 
|  | } else if((minTwoByte==0x4040 || (0x4141<=minTwoByte && minTwoByte<=0x41fe)) && maxTwoByte<=0xfefe) { | 
|  | charsetFamily=EBCDIC; | 
|  | } | 
|  | } | 
|  | if(charsetFamily==UNKNOWN) { | 
|  | fprintf(stderr, "error: unable to determine the charset family\n"); | 
|  | exit(3); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* reset variant indicators if they do not apply */ | 
|  | if(charsetFamily!=ASCII || minCharLength!=1) { | 
|  | variantASCII=variantSUB=variantControls=0; | 
|  | } else if(countASCII!=94) { | 
|  | /* if there are not 94 mappings for ASCII graphic characters, then set variantASCII */ | 
|  | variantASCII=1; | 
|  | } | 
|  |  | 
|  | if(charsetFamily!=EBCDIC || minCharLength!=1) { | 
|  | variantLF=0; | 
|  | } | 
|  | } | 
|  |  | 
|  | static int | 
|  | getSubchar(const char *name) { | 
|  | int i; | 
|  |  | 
|  | for(i=0; i<sizeof(knownSubchars)/sizeof(knownSubchars[0]); ++i) { | 
|  | if(strcmp(name, knownSubchars[i].name)==0) { | 
|  | subchar=knownSubchars[i].subchar; | 
|  | subchar1=knownSubchars[i].subchar1; | 
|  | return 1; | 
|  | } | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static void | 
|  | getSubcharFromUPMAP(FILE *f) { | 
|  | char line[200]; | 
|  | char *s, *end; | 
|  | unsigned long *p; | 
|  | unsigned long value, bytes; | 
|  |  | 
|  | while(fgets(line, sizeof(line), f)!=NULL && memcmp(line, "CHARMAP", 7)!=0) { | 
|  | s=(char *)skipWhitespace(line); | 
|  |  | 
|  | /* skip empty lines */ | 
|  | if(*s==0 || *s=='\n' || *s=='\r') { | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* look for variations of subchar entries */ | 
|  | if(memcmp(s, "<subchar>", 9)==0) { | 
|  | s=(char *)skipWhitespace(s+9); | 
|  | p=&subchar; | 
|  | } else if(memcmp(s, "<subchar1>", 10)==0) { | 
|  | s=(char *)skipWhitespace(s+10); | 
|  | p=&subchar1; | 
|  | } else if(memcmp(s, "#<subchar1>", 11)==0) { | 
|  | s=(char *)skipWhitespace(s+11); | 
|  | p=&subchar1; | 
|  | } else { | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* get the value and store it in *p */ | 
|  | bytes=0; | 
|  | while(s[0]=='\\' && s[1]=='x') { | 
|  | value=strtoul(s+2, &end, 16); | 
|  | s+=4; | 
|  | if(end!=s) { | 
|  | fprintf(stderr, "error parsing UPMAP subchar from \"%s\"\n", line); | 
|  | exit(2); | 
|  | } | 
|  | bytes=(bytes<<8)|value; | 
|  | } | 
|  | *p=bytes; | 
|  | } | 
|  | } | 
|  |  | 
|  | static const char * | 
|  | getStateTable() { | 
|  | int i; | 
|  |  | 
|  | for(i=0; i<sizeof(knownStateTables)/sizeof(knownStateTables[0]); ++i) { | 
|  | if(ccsid==knownStateTables[i].ccsid) { | 
|  | return knownStateTables[i].table; | 
|  | } | 
|  | } | 
|  |  | 
|  | return NULL; | 
|  | } | 
|  |  | 
/*
 * Write the codepage bytes b into s as a NUL-terminated sequence of \xHH
 * escapes with uppercase hex digits. The number of escapes (1 to 4) is
 * chosen from the magnitude of b; the most significant byte comes first.
 */
static void
writeBytes(char *s, unsigned long b) {
    int shift;

    /* bit position of the leading byte */
    if(b<=0xff) {
        shift=0;
    } else if(b<=0xffff) {
        shift=8;
    } else if(b<=0xffffff) {
        shift=16;
    } else {
        shift=24;
    }

    /* the leading byte is everything above the lower bytes and is not masked */
    s+=sprintf(s, "\\x%02lX", b>>shift);

    /* the remaining bytes, one at a time */
    while(shift>0) {
        shift-=8;
        s+=sprintf(s, "\\x%02lX", (b>>shift)&0xff);
    }
}
|  |  | 
|  | static void | 
|  | writeUCM(FILE *f, const char *ucmname, const char *rpname, const char *tpname) { | 
|  | char buffer[100]; | 
|  | const char *s; | 
|  | long i; | 
|  |  | 
|  | /* write the header */ | 
|  | fprintf(f, | 
|  | "# *******************************************************************************\n" | 
|  | "# *\n" | 
|  | "# *   Copyright (C) 1995-2001, International Business Machines\n" | 
|  | "# *   Corporation and others.  All Rights Reserved.\n" | 
|  | "# *\n" | 
|  | "# *******************************************************************************\n" | 
|  | "#\n" | 
|  | "# File created by rptp2ucm (compiled on %s)\n" | 
|  | "# from source files %s and %s\n" | 
|  | "#\n", __DATE__, rpname, tpname); | 
|  |  | 
|  | /* ucmname does not have a path or .ucm */ | 
|  | fprintf(f, "<code_set_name>               \"%s\"\n", ucmname); | 
|  |  | 
|  | fputs("<char_name_mask>              \"AXXXX\"\n", f); | 
|  | fprintf(f, "<mb_cur_max>                  %u\n", maxCharLength); | 
|  | fprintf(f, "<mb_cur_min>                  %u\n", minCharLength); | 
|  |  | 
|  | if(maxCharLength==1) { | 
|  | fputs("<uconv_class>                 \"SBCS\"\n", f); | 
|  | } else if(maxCharLength==2) { | 
|  | if(minCharLength==1) { | 
|  | if(charsetFamily==EBCDIC) { | 
|  | fputs("<uconv_class>                 \"EBCDIC_STATEFUL\"\n", f); | 
|  | } else { | 
|  | fputs("<uconv_class>                 \"MBCS\"\n", f); | 
|  | } | 
|  | } else if(minCharLength==2) { | 
|  | fputs("<uconv_class>                 \"DBCS\"\n", f); | 
|  | } else { | 
|  | fputs("<uconv_class>                 \"MBCS\"\n", f); | 
|  | } | 
|  | } else { | 
|  | fputs("<uconv_class>                 \"MBCS\"\n", f); | 
|  | } | 
|  |  | 
|  | if(subchar!=0) { | 
|  | writeBytes(buffer, subchar); | 
|  | fprintf(f, "<subchar>                     %s\n", buffer); | 
|  | } | 
|  |  | 
|  | if(subchar1!=0) { | 
|  | fprintf(f, "<subchar1>                    \\x%02X\n", subchar1); | 
|  | } | 
|  |  | 
|  | /* write charset family */ | 
|  | if(charsetFamily==ASCII) { | 
|  | fputs("<icu:charsetFamily>           \"ASCII\"\n", f); | 
|  | } else { | 
|  | fputs("<icu:charsetFamily>           \"EBCDIC\"\n", f); | 
|  | } | 
|  |  | 
|  | /* write alias describing the codepage */ | 
|  | sprintf(buffer, "<icu:alias>                   \"ibm-%u", ccsid); | 
|  | if(!usesPUA && !variantLF && !variantASCII && !variantControls && !variantSUB) { | 
|  | strcat(buffer, "_STD\"\n\n"); | 
|  | } else { | 
|  | /* add variant indicators in alphabetic order */ | 
|  | if(variantASCII) { | 
|  | strcat(buffer, "_VASCII"); | 
|  | } | 
|  | if(variantControls) { | 
|  | strcat(buffer, "_VGCTRL"); | 
|  | } | 
|  | if(variantLF) { | 
|  | strcat(buffer, "_VLF"); | 
|  | } | 
|  | if(variantSUB) { | 
|  | strcat(buffer, "_VSUB"); | 
|  | } | 
|  | if(usesPUA) { | 
|  | strcat(buffer, "_VPUA"); | 
|  | } | 
|  | strcat(buffer, "\"\n\n"); | 
|  | } | 
|  | fputs(buffer, f); | 
|  |  | 
|  | /* write the state table - <icu:state> */ | 
|  | s=getStateTable(); | 
|  | if(s!=NULL) { | 
|  | fputs(s, f); | 
|  | fputs("\n", f); | 
|  | } else if(is7Bit) { | 
|  | fputs("<icu:state>                   0-7f\n\n", f); | 
|  | } | 
|  |  | 
|  | /* write the mappings */ | 
|  | fputs("CHARMAP\n", f); | 
|  | for(i=0; i<fromUMappingsTop; ++i) { | 
|  | writeBytes(buffer, fromUMappings[i].b); | 
|  | fprintf(f, "<U%04lX> %s |%lu\n", fromUMappings[i].u&0xffffff, buffer, fromUMappings[i].u>>24); | 
|  | } | 
|  | fputs("END CHARMAP\n", f); | 
|  | } | 
|  |  | 
|  | static void | 
|  | processTable(const char *arg) { | 
|  | char filename[1024], tpname[32]; | 
|  | const char *basename, *s; | 
|  | FILE *rpmap, *tpmap, *ucm; | 
|  | unsigned long value, unicode; | 
|  | int length; | 
|  |  | 
|  | init(); | 
|  |  | 
|  | /* separate path and basename */ | 
|  | basename=strrchr(arg, '/'); | 
|  | if(basename==NULL) { | 
|  | basename=strrchr(arg, '\\'); | 
|  | if(basename==NULL) { | 
|  | basename=arg; | 
|  | } else { | 
|  | ++basename; | 
|  | } | 
|  | } else { | 
|  | ++basename; | 
|  | s=strrchr(arg, '\\'); | 
|  | if(s!=NULL && ++s>basename) { | 
|  | basename=s; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* is this a standard RPMAP filename? */ | 
|  | value=strtoul(basename, (char **)&s, 16); | 
|  | if( strlen(basename)!=17 || | 
|  | (memcmp(basename+9, "RPMAP", 5)!=0 && memcmp(basename+9, "rpmap", 5)!=0 && | 
|  | memcmp(basename+9, "RXMAP", 5)!=0 && memcmp(basename+9, "rxmap", 5)!=0) || | 
|  | (s-basename)!=8 || | 
|  | *s!='.' | 
|  | ) { | 
|  | fprintf(stderr, "error: \"%s\" is not a standard RPMAP filename\n", basename); | 
|  | exit(1); | 
|  | } | 
|  |  | 
|  | /* is this really a Unicode conversion table? - get the CCSID */ | 
|  | unicode=value&0xffff; | 
|  | if(unicode==13488 || unicode==17584) { | 
|  | ccsid=(unsigned int)(value>>16); | 
|  | } else { | 
|  | unicode=value>>16; | 
|  | if(unicode==13488 || unicode==17584) { | 
|  | ccsid=(unsigned int)(value&0xffff); | 
|  | } else { | 
|  | fprintf(stderr, "error: \"%s\" is not a Unicode conversion table\n", basename); | 
|  | exit(1); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* try to open the RPMAP file */ | 
|  | rpmap=fopen(arg, "r"); | 
|  | if(rpmap==NULL) { | 
|  | fprintf(stderr, "error: unable to open \"%s\"\n", arg); | 
|  | exit(1); | 
|  | } | 
|  |  | 
|  | /* try to open the TPMAP file */ | 
|  | strcpy(filename, arg); | 
|  | length=strlen(filename); | 
|  |  | 
|  | /* guess the TPMAP filename; note that above we have checked the format of the basename */ | 
|  | /* replace the R in RPMAP by T, keep upper- or lowercase */ | 
|  | if(filename[length-8]=='R') { | 
|  | filename[length-8]='T'; | 
|  | } else { | 
|  | filename[length-8]='t'; | 
|  | } | 
|  |  | 
|  | /* reverse the CCSIDs */ | 
|  | memcpy(filename+length-17, basename+4, 4); | 
|  | memcpy(filename+length-13, basename, 4); | 
|  |  | 
|  | /* first, keep the same suffix */ | 
|  | tpmap=fopen(filename, "r"); | 
|  | if(tpmap==NULL) { | 
|  | /* next, try reducing the second to last digit by 1 */ | 
|  | --filename[length-2]; | 
|  | tpmap=fopen(filename, "r"); | 
|  | if(tpmap==NULL) { | 
|  | /* there is no TPMAP */ | 
|  | fprintf(stderr, "error: unable to find the TPMAP file for \"%s\"\n", arg); | 
|  | exit(1); | 
|  | } | 
|  | } | 
|  | strcpy(tpname, filename+length-17); | 
|  |  | 
|  | /* parse both files */ | 
|  | fromUMappingsTop=parseMappings(rpmap, fromUMappings); | 
|  | toUMappingsTop=parseMappings(tpmap, toUMappings); | 
|  | fclose(tpmap); | 
|  | fclose(rpmap); | 
|  |  | 
|  | /* if there is no subchar, then try to get it from the corresponding UPMAP */ | 
|  | if(subchar==0) { | 
|  | FILE *f; | 
|  |  | 
|  | /* restore the RPMAP filename and just replace the R by U */ | 
|  | strcpy(filename+length-17, basename); | 
|  | if(filename[length-8]=='R') { | 
|  | filename[length-8]='U'; | 
|  | } else { | 
|  | filename[length-8]='u'; | 
|  | } | 
|  |  | 
|  | f=fopen(filename, "r"); | 
|  | if(f==NULL) { | 
|  | /* try reversing the CCSIDs */ | 
|  | memcpy(filename+length-17, basename+4, 4); | 
|  | memcpy(filename+length-13, basename, 4); | 
|  | f=fopen(filename, "r"); | 
|  | } | 
|  | if(f!=NULL) { | 
|  | getSubcharFromUPMAP(f); | 
|  | fclose(f); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* generate the .ucm filename - necessary before getSubchar() */ | 
|  | length=sprintf(filename, "ibm-%u_", ccsid); | 
|  |  | 
|  | /* uppercase and append the suffix */ | 
|  | filename[length++]=toupper(basename[10]);  /* P or X */ | 
|  | filename[length++]=toupper(basename[14]);  /* last 3 suffix characters */ | 
|  | filename[length++]=toupper(basename[15]); | 
|  | filename[length++]=toupper(basename[16]); | 
|  | filename[length++]='-'; | 
|  | filename[length]=0; | 
|  | /*concatenate year*/ | 
|  | strcat(filename,YEAR); | 
|  | /* find the subchar if still necessary - necessary before merging for correct |2 */ | 
|  | if(subchar==0 && !getSubchar(filename+4)) { | 
|  | fprintf(stderr, "warning: missing subchar in \"%s\" (CCSID=0x%04X)\n", filename, ccsid); | 
|  | } | 
|  |  | 
|  | /* merge the mappings */ | 
|  | mergeMappings(); | 
|  |  | 
|  | /* analyze the conversion table */ | 
|  | analyzeTable(); | 
|  |  | 
|  | /* open the .ucm file */ | 
|  | strcat(filename, ".ucm"); | 
|  | ucm=fopen(filename, "w"); | 
|  | if(ucm==NULL) { | 
|  | fprintf(stderr, "error: unable to open output file \"%s\"\n", filename); | 
|  | exit(4); | 
|  | } | 
|  |  | 
|  | /* remove the .ucm from the filename for the following processing */ | 
|  | filename[strlen(filename)-4]=0; | 
|  |  | 
|  | /* write the .ucm file */ | 
|  | writeUCM(ucm, filename, basename, tpname); | 
|  | fclose(ucm); | 
|  | } | 
|  |  | 
/*
 * Program entry point: convert each RPMAP/RXMAP file named on the command
 * line, in order. Prints a usage message and exits with 1 if no file is given.
 */
extern int
main(int argc, const char *argv[]) {
    int i;

    if(argc<2) {
        fprintf(stderr,
                "usage: %s { rpmap/rxmap-filename }+\n",
                argv[0]);
        exit(1);
    }

    /* argv[0] is the program name; every further argument is one table */
    for(i=1; i<argc; ++i) {
        processTable(argv[i]);
    }

    return 0;
}